# =====================================================================
# extLink.awk: spot external links and label them as such.
#
# Copyright (c) 2009 Carlo Strozzi
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; version 2 dated June, 1991.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
# =====================================================================

# =====================================================================
# string extLink(string html)
#
# The text of 'html' up to (not including) its first ">" is treated as
# the opening part of a tag; everything from that ">" onwards is
# returned untouched after it.
#
# If the tag part contains an http(s) URL that does not point to the
# site's own domain (taken from the CSA_FQDN environment variable) nor
# to the literal "(:localhost:)" token, the tag part gets:
#
#   - "tw-extlink" added to its class attribute,
#   - "nofollow" added to its rel attribute, but only when
#     _bool(ENVIRON["TNS_EXT_NOFOLLOW"]) equals _TRUE,
#   - "external" added to its rel attribute.
#
# Existing quoted class/rel attributes are extended, otherwise a new
# attribute is appended to the tag part.
#
# 'tmp' and 're' are local variables, callers pass 'html' only.
# =====================================================================

function extLink(html,    tmp,re) {

   re = ENVIRON["CSA_FQDN"]

   # Without a site name there is nothing to compare URLs against, so
   # the input is handed back as it came.
   # NOTE(review): this keeps the outcome of the previous pattern, which
   # matched every URL in that case -- assuming _escreg("") is empty.

   if (re == "") return html

   # Match on 2nd level domain as opposed to the host name,
   # which seems reasonable. That is, links to different hosts
   # in the same domain are NOT considered external links.

   if (re ~ /^[^.]+\.[^.]+\.[^.]+$/) sub(/^[^.]+\./,_NULL,re)

   # The domain must sit where the host name of the URL is: it may only
   # be preceded by host characters ending in a dot, and it must be
   # followed by an optional port and then by something that ends a
   # host name (path, query, fragment, quote, blank or end of text).
   # Otherwise "http://example.com.evil.org/" would pass as internal.
   #
   # The pattern is still searched for anywhere in the tag, so an
   # internal URL occurring elsewhere in it (another attribute, a query
   # string) also makes the link count as internal.

   re = "https?://(([^/?#\"' \t]*\\.)?" \
        _escreg(re) "|" _escreg("(:localhost:)") ")" \
        "(:[0-9]+)?([/?#\"' \t]|$)"

   # Split the input at its first ">": 'tmp' gets the tag part and
   # 'html' keeps the rest. The second pattern is anchored so that an
   # input starting with ">" is left alone, instead of losing the text
   # that follows that ">".

   tmp = html

   sub(/>.*/,_NULL,tmp)
   sub(/^[^>]+/,_NULL,html)

   if (tmp ~ /https?:\/\// && tmp !~ re) {

      # Add to current "class" attribute if defined, or append one anew.
      if (!sub(/[^a-z]*class=["']/, "&tw-extlink ",tmp))
         sub(/$/," class='tw-extlink'",tmp)

      if (_bool(ENVIRON["TNS_EXT_NOFOLLOW"]) == _TRUE) {

         # Add to current "rel" attribute if defined, or append one anew.
         # Note that adding to an existing one may break XFN relations,
         # but Google's nofollow hack is notoriously badly conceived.

         if (tmp !~ /[^a-z]*rel=["'][^'"]*nofollow/ && \
            !sub(/[^a-z]*rel=["']/, "&nofollow ",tmp))
            sub(/$/," rel='nofollow'",tmp)
      }

      # Always set the rel="external" attribute, mainly for validators.
      if (tmp !~ /[^a-z]*rel=["'][^'"]*external/ && \
         !sub(/[^a-z]*rel=["']/, "&external ",tmp))
         sub(/$/," rel='external'",tmp)
   }

   return tmp html
}

# EOF