# =====================================================================
# _userencode_1.awk: unixify a TW group name string.
#
# Copyright (c) 2007-2011 Carlo Strozzi
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; version 2 dated June, 1991.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
# =====================================================================
# string _userencode_1(string s [,int maxlen])
#
# Reduces "s" to a lower-case string made only of the characters
# [-a-z0-9]:
#
#   1. "s" is first passed through _strip(s,_O_MIDDLE).
#   2. For the languages listed below it is transliterated with
#      _asciify(s); without that step, text in locales where most
#      characters are non-ASCII (Cyrillic, etc.) would come out as
#      little more than a run of "x".
#   3. The result is lower-cased, runs of hyphens/blanks/tabs collapse
#      into one "-", and every remaining character outside [-a-z0-9]
#      becomes "x".
#   4. If "maxlen" evaluates to a number greater than zero, the result
#      is cut to its first "maxlen" characters.
#   5. A leading "-" is finally replaced by "x".
#
# The trailing arguments (a,i,j,e) are local scratch variables; they
# are referenced only by the commented-out fallback loop in the body.
#
# NOTE(review): the title above says "group name" although the function
# is called _userencode_1 -- confirm which wording is intended.
# =====================================================================

function _userencode_1(s, maxlen,    a, i, j, e) {

    # NOTE(review): presumably normalizes whitespace; the exact effect
    # of the _O_MIDDLE option is defined by _strip() -- confirm there.
    s = _strip(s, _O_MIDDLE)

    # Strongly non-ASCII languages. Extend the pattern as needed.
    # NOTE(review): this test goes through the _LANG[] table, whereas
    # the Italian test further down reads CSA_LANG directly -- confirm
    # that the difference is intentional.
    if (_LANG[ENVIRON["CSA_LANG"]] ~ /^(ru|zh)/) {
        s = _asciify(s)

        # The following makes the result impossible to un-asciify, but
        # keeps backward compatibility. Since reversibility is lost
        # anyway, superfluous hyphens are dropped as well, which gives
        # more compact page names: the "-20" and "-2d" sequences are
        # parked on \001, every other hyphen is deleted, then each
        # \001 is turned into a single "-".
        gsub(/-2[0d]/, "\001", s)
        gsub(/-/, _NULL, s)
        gsub(/\001/, "-", s)
    }

    # mawk's tolower()/toupper() only touch US-ASCII letters and leave
    # everything else alone. That is good enough here, because anything
    # outside US-ASCII ends up as "x" anyway.
    s = tolower(s)

    # Backward compatibility with pre-UTF-8 versions of TW: lower-case
    # Italian accented letters encoded in UTF-8 must yield one "x", not
    # two, so that existing TW page databases keep working:
    #
    #   agrave  \xc3\xa0
    #   egrave  \xc3\xa8
    #   eacute  \xc3\xa9
    #   igrave  \xc3\xac
    #   ograve  \xc3\xb2
    #   ugrave  \xc3\xb9
    #
    # Any other UTF-8 character, upper-case accented Italian letters
    # included, becomes the "xx" sequence.
    if (_REQ_CHARSET == "utf-8" && ENVIRON["CSA_LANG"] ~ /^it/) {
        gsub(/\303(\240|\250|\251|\254|\262|\271)/, "x", s)
    }

    # Collapse every run of hyphens, blanks and tabs into one hyphen.
    gsub(/[- \t]+/, "-", s)

    # Warning: with gawk(1) the next substitution is reliable only when
    # the locale(5) is POSIX (i.e. LANG is either null or "POSIX").
    # In other locales a range such as a-z may also match accented
    # letters and the like; in that case use the slower explicit loop
    # kept below, which works with both mawk(1) and gawk(1).
    gsub(/[^-a-z0-9]/, "x", s)

    #i = split(s,a,_NULL)
    #s = _NULL
    #for (j=1; j<=i; j++) {
    #   if ((e=_atoi(a[j])) != 45 && \
    #      (e < 48 || e > 122 || (e > 57 && e < 97))) s = s "x"
    #   else s = s a[j]
    #}

    # Optional truncation. Force "maxlen" to a number first, so that a
    # missing or non-numeric argument counts as zero (no truncation).
    # Cutting the string is likely to break asciified strings (see the
    # _asciify() branch above) and make them impossible to un-asciify.
    maxlen += 0
    if (maxlen > 0) {
        s = substr(s, 1, maxlen)
    }

    # The result must not begin with a hyphen.
    sub(/^-/, "x", s)

    return s
}