1	# =====================================================================
     2	# _userencode_1.awk: unixify a TW group name string.
     3	#
     4	# Copyright (c) 2007-2011 Carlo Strozzi
     5	#
     6	# This program is free software; you can redistribute it and/or modify
     7	# it under the terms of the GNU General Public License as published by
     8	# the Free Software Foundation; version 2 dated June, 1991.
     9	#
    10	# This program is distributed in the hope that it will be useful,
    11	# but WITHOUT ANY WARRANTY; without even the implied warranty of
    12	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    13	# GNU General Public License for more details.
    14	# 
    15	# You should have received a copy of the GNU General Public License 
    16	# along with this program; if not, write to the Free Software
    17	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
    18	#
    19	# =====================================================================
    20	# string _userencode_1(string s [,int maxlen])
    21	#
    22	# Note: to handle locales where most characters are non-ASCII (such as
    23	# Cyrillic, etc.), this algorithm uses _asciify(s), otherwise the
    24	# resulting strings would be made mostly of "x" .
    25	# =====================================================================
    26	
    27	function _userencode_1(s,maxlen,			     a,i,j,e) {
    28	
    29	   s = _strip(s,_O_MIDDLE)
    30	
    31	   # Handle strongly non-ASCII languages. Add more as needed.
    32	   if (_LANG[ENVIRON["CSA_LANG"]] ~ /^(ru|zh)/) {
    33	      s = _asciify(s)
    34	
    35	      # This will make the result non-unasciifiable, but it will
    36	      # retain backward-compatibility. Since unasciifiability is
    37	      # lost anyway, I also remove unnecessary hyphens to get more 
    38	      # compact page names.
    39	      gsub(/-2[0d]/,"\001",s)
    40	      gsub(/-/,_NULL,s)
    41	      gsub(/\001/,"-",s)
    42	   }
    43	
    44	   # Note that mawk's tolower() and toupper() will convert only
    45	   # US-ASCII characters and leave anything else unchanged, but that's
    46	   # fine here, since non US-ASCII stuff will always have to be changed
    47	   # into "x" anyway.
    48	
    49	   s = tolower(s)
    50	
    51	   # This is for backward compatibility with pre-utf8 versions of TW.
    52	   # The idea is that lower-case Italian UTF-8 accented characters are
    53	   # turned to a single "x" instead of two, not to break previous TW
    54	   # page databases:
    55	   #
    56	   # agrave \xc3\xa0
    57	   # egrave \xc3\xa8
    58	   # eacute \xc3\xa9
    59	   # igrave \xc3\xac
    60	   # ograve \xc3\xb2
    61	   # ugrave \xc3\xb9
    62	   #
    63	   # All other UTF-8 characters, including upper-case accented Italian
    64	   # letters, will be turned into the "xx" sequence.
    65	
    66	   if (_REQ_CHARSET == "utf-8" && ENVIRON["CSA_LANG"] ~ /^it/)
    67			gsub(/\303(\240|\250|\251|\254|\262|\271)/,"x",s)
    68	
    69	   gsub(/[- \t]+/,"-",s)
    70	
    71	   # Warning: the next instruction works with gawk(1) only if the
    72	   # locale(5) is set to POSIX (i.e. if the LANG environment variable
    73	   # is set to either null or "POSIX"). Otherwise the regular expression
    74	   # pattern [a-zA-Z] will match also accented letters and the like, in
    75	   # which case the following (slower) explicit loop must be used instead,
    76	   # which will work with both mawk(1) and gawk(1).
    77	
    78	   gsub(/[^-a-z0-9]/,"x",s)				# group name.
    79	
    80	   #i = split(s,a,_NULL)
    81	   #s = _NULL
    82	   #for (j=1; j<=i; j++) {
    83	   #    if ((e=_atoi(a[j])) != 45 && \
    84	   #		(e < 48 || e > 122 || (e > 57 && e < 97))) s = s "x"
    85	   #       else s = s a[j]
    86	   #}
    87	
    88	   # This is likely to break asciified strings (see further up)
    89	   # and make them non-unasciifyable.
    90	   if ((maxlen/=1) > 0) s = substr(s,1,maxlen)
    91	
    92	   sub(/^-/,"x",s)
    93	   return s
    94	}
    95