# =====================================================================
# _userencode_2.awk: unixify a TW page name string.
#
# Copyright (c) 2007-2011 Carlo Strozzi
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; version 2 dated June, 1991.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
# =====================================================================
# string _userencode_2(string s [,int maxlen])
#
# Turn the page name "s" into a string made only of dashes, lower-case
# US-ASCII letters, digits and at most one "." ; every other character
# is replaced by "x". If "maxlen" is greater than zero the result is
# truncated to that many characters. A leading "-" in the final result
# is replaced by "x".
#
# Parameters after "maxlen" (tmp,a,i,j,e) are local variables; a,i,j,e
# are only needed by the alternative loop that is commented out below.
#
# Note: to handle locales where most characters are non-ASCII (such as
# Cyrillic, etc.), this algorithm uses _asciify(s), otherwise the
# resulting strings would be made mostly of "x" .
#
# NOTE(review): _asciify(), _strip(), _bool(), getcat() and the globals
# _NULL, _TRUE, _O_MIDDLE, _LANG and _REQ_CHARSET are defined elsewhere;
# what is said about them here is inferred from how they are used.
# =====================================================================

function _userencode_2(s,maxlen,      tmp,a,i,j,e) {

   # Handle strongly non-ASCII languages. Add more as needed.
   if (_LANG[ENVIRON["CSA_LANG"]] ~ /^(ru|zh)/) {
      # If there is a "." with a non-dot character on both sides,
      # asciify the part before the first "." and the rest of the
      # string separately, so that the "." itself is left untouched.
      if (s ~ /[^.]\.[^.]/) {
         tmp = s
         sub(/\..*/,_NULL,s)
         sub(/[^.]+\./,_NULL,tmp)
         s = _asciify(s) "." _asciify(tmp)
      }

      else s = _asciify(s)

      # This will make the result non-unasciifiable, but it will
      # retain backward-compatibility. Since unasciifiability is
      # lost anyway, I also remove unnecessary hyphens to get more
      # compact page names.
      #
      # "\001" is a temporary placeholder that protects the "-20" and
      # "-2d" sequences (presumably the _asciify() escapes for blank
      # and dash -- confirm) while all other hyphens are removed; each
      # placeholder is then turned into a single "-".
      gsub(/-2[0d]/,"\001",s)
      gsub(/-/,_NULL,s)
      gsub(/\001/,"-",s)
   }

   # Note that mawk's tolower() and toupper() will convert only
   # US-ASCII characters and leave anything else unchanged, but that's
   # fine here, since non US-ASCII stuff will always have to be changed
   # into "x" anyway.

   s = tolower(s)

   # For backward-compatibility, if this variable is set
   # to true then sequences of multiple blanks and/or dashes
   # in page and attachment names will be turned into as many
   # dashes, i.e. a page name like "My page - how nice" becomes
   # "my-page---how-nice". By default they are squeezed to one
   # single dash, that is "my-page-how-nice" .
   #
   # This hack was necessary to retain backward-compatibility
   # with older pre-existing TW databases.

   if (_bool(ENVIRON["TNS_LOCAL_HACK_1"]) == _TRUE) s = _strip(s)
   else s = _strip(s,_O_MIDDLE)

   # This is for backward compatibility with pre-utf8 versions of TW.
   # The idea is that lower-case Italian UTF-8 accented characters are
   # turned to a single "x" instead of two, not to break previous TW
   # page databases:
   #
   # agrave	\xc3\xa0
   # egrave	\xc3\xa8
   # eacute	\xc3\xa9
   # igrave	\xc3\xac
   # ograve	\xc3\xb2
   # ugrave	\xc3\xb9
   #
   # All other UTF-8 characters, including upper-case accented Italian
   # letters, will be turned into the "xx" sequence.

   if (_REQ_CHARSET == "utf-8" && ENVIRON["CSA_LANG"] ~ /^it/)
      gsub(/\303(\240|\250|\251|\254|\262|\271)/,"x",s)

   # Same backward-compatibility hack again, see above: with the hack
   # enabled each run of tabs becomes one dash and every single blank
   # or dash becomes one dash; otherwise any run of dashes, blanks and
   # tabs is squeezed into one single dash.
   if (_bool(ENVIRON["TNS_LOCAL_HACK_1"]) == _TRUE) {
      gsub(/\t+/,"-",s)
      gsub(/[- ]/,"-",s)
   }
   else gsub(/[- \t]+/,"-",s)

   # When getcat(s) yields _NULL no dot at all is allowed to survive.
   if (getcat(s) == _NULL) gsub(/\./,"x",s)

   # Now let the first well-placed "." leak through: a leading or
   # trailing "." and any literal ":" become "x", then the first
   # remaining "." is parked as ":" so that it survives the clean-up
   # below. tmp records whether such a "." was found.
   gsub(/(^\.|\.$|:)/,"x",s)
   tmp=sub(/\./,":",s)

   # Replace everything that is not a dash, a lower-case US-ASCII
   # letter, a digit or the ":" placeholder with "x".
   #
   # The character class is deliberately spelled out in full instead
   # of being written as [^-a-z0-9:]. With gawk(1), unless the
   # locale(5) is set to POSIX, a range such as [a-z] may also match
   # accented letters and the like, which would then leak into the
   # result; an explicit list only matches the characters it names,
   # and it is equivalent to the range form in the POSIX locale.
   #
   # NOTE(review): in a multibyte locale gawk(1) presumably matches
   # whole characters rather than bytes, so a non-ASCII character may
   # still yield one "x" instead of one per byte -- confirm that the
   # program is run with a byte-oriented locale if that matters.
   #
   # The (slower) explicit loop kept below for reference is an
   # alternative that works with both mawk(1) and gawk(1).

   gsub(/[^-abcdefghijklmnopqrstuvwxyz0123456789:]/,"x",s)

   #i = split(s,a,_NULL)
   #s = _NULL
   #for (j=1; j<=i; j++) {
   #   if ((e=_atoi(a[j])) != 45 && \
   #   	(e < 48 || e > 122 || (e > 58 && e < 97))) s = s "x"
   #   else s = s a[j]
   #}

   # Restore the parked "." (if any); any other ":" becomes "x".
   if (tmp) sub(/:/,".",s)
   gsub(/:/,"x",s)

   # This is likely to break asciified strings (see further up)
   # and make them non-unasciifiable.
   # "maxlen/=1" forces maxlen to a number, so that an unset or empty
   # value compares as zero and disables truncation.
   if ((maxlen/=1) > 0) s = substr(s,1,maxlen)

   # The result must never start with a dash.
   sub(/^-/,"x",s)

   return s
}