# =====================================================================
# pb-discover.awk: W-TW pingback discovery AWK function.
#
# Copyright (c) 2007,2008,2009,2010 Carlo Strozzi
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; version 2 dated June, 1991.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
# =====================================================================
#
# Input:  an HTTP response (headers followed by the body), one record
#         per line. Header lines are expected to carry a leading
#         "<blanks><number><blank>" prefix before the header name.
#
# Output: three values separated by newlines, with NO trailing newline:
#
#            host
#            port
#            path
#
#         When no pingback URL is found, host and path are empty and
#         port keeps its default value of "80".
#
# =====================================================================

# 'a', 'i' and 'j' are scratch variables; only 'i' is used below.
BEGIN { url=host=body=path=""; port="80"; delete a; i=j=0 }

# Try and extract the pingback URL from the HTTP response header first
# (accounting for the response format produced by 'curl --include').

sub(/^ +[0-9]+ [Xx]-[Pp][Ii][Nn][Gg][Bb][Aa][Cc][Kk]: +/,"") {

  # Pingback URL found in header, no need to scan the response body.
  # See http://www.terenzani.it/26/specifiche-pingback/#toc3
  #
  # NOTE(review): unlike the body branches in END, the URL scheme is
  # not checked here, so an https URL coming from the header is passed
  # through with the default port 80 -- confirm this is intended.

  url = $0
  exit		# jump to final processing.
}

{
  # Unfortunately, spotting the pingback URL in the response body
  # is more complicated, as the relevant element may be split
  # over multiple lines, so we need to buffer everything.

  body = body "\n" $0
}

END {

  if (url == "") {

    # Case 1: <link rel="pingback" href="URL" ...>
    # Drop everything up to and including the quote opening the href.

    if (sub(/.*<[Ll][Ii][Nn][Kk][ \t\r\n]+[Rr][Ee][Ll][ \t\r\n]*=[ \t\r\n]*['\"][Pp][Ii][Nn][Gg][Bb][Aa][Cc][Kk]['\"][ \t\r\n]+[Hh][Rr][Ee][Ff][ \t\r\n]*=[ \t\r\n]*['\"]/,"",body)) {

      # Better to be over-cautious. I do not support https URLs
      # because the backend nc(1) does not handle them.
      #
      # The trailing slash of the element is optional ("/>" or ">"),
      # and the URL is accepted only if its closing delimiter was
      # actually found, otherwise 'body' would still hold the rest
      # of the document.

      if (sub(/['\"][ \t\r\n]*\/?[ \t]*>.*/,"",body) &&
          body ~ /^[Hh][Tt][Tt][Pp]:\/\/.+/) url = body
    }

    # Case 2: <link href="URL" rel="pingback" ...>
    # Drop everything from the quote closing the href onwards, then
    # everything up to and including the quote opening the href.

    else if (sub(/['\"][ \t\r\n]+[Rr][Ee][Ll][ \t\r\n]*=[ \t\r\n]*['\"][Pp][Ii][Nn][Gg][Bb][Aa][Cc][Kk]['\"][ \t\r\n]*\/?[ \t]*>.*/,"",body)) {
      sub(/.*<[Ll][Ii][Nn][Kk][ \t\r\n]+[Hh][Rr][Ee][Ff][ \t\r\n]*=[ \t\r\n]*['\"]/,"",body)

      # Better to be over-cautious. I do not support https URLs
      # because the backend nc(1) does not handle them.

      if (body ~ /^[Hh][Tt][Tt][Pp]:\/\/.+/) url = body
    }
  }

  if (url != "") {
    gsub(/[\n\r]+/," ",url)	# better to be safe than sorry
    sub(/^[ \t]+/,"",url)	# e.g. the blank left by a trailing CR
    sub(/[ \t]+$/,"",url)
    sub(/^[^:]+:\/\//,"",url)	# remove the protocol part.

    # Split "host[:port]" from the path. A URL with no path at all
    # (e.g. http://example.com) gets the root path "/".

    i = index(url,"/")

    if (i > 0) {
      host = substr(url,1,i-1)
      path = substr(url,i)
    }
    else {
      host = url
      path = "/"
    }

    # NOTE(review): with POSIX replacement-string rules "\\&" yields a
    # literal '&', so this statement looks like a no-op. Confirm
    # whether the intent was to emit '\&' or to decode '&amp;'.

    gsub(/&/,"\\&",path)	# Escape XML entity if present.

    if (host ~ /]?:[0-9]+$/) {
      if (sub(/^\[/,"",host)) {			# IPv6 numeric host.
        port = substr(host,index(host,"]:")+2)
        sub(/]:.*/,"",host)
      }
      else {					# all else.
        port = substr(host,index(host,":")+1)
        sub(/:.*/,"",host)
      }
    }
    else if (host ~ /^\[.*]$/) {

      # IPv6 numeric host with no explicit port: strip the brackets,
      # as is done above when a port is present.

      host = substr(host,2,length(host)-2)
    }
  }

  printf("%s\n%s\n%s",host,port,path)
}

# EOF