#!/usr/bin/rc -p
# =====================================================================
# pb-discover: try and discover whether a web page supports the
#              Pingback protocol, and return 'host', 'port' and 'path'
#              if it does.
#
# Copyright (c) 2007,2008,2009 Carlo Strozzi
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; version 2 dated June, 1991.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
# =====================================================================

# =====================================================================
# Program initialization
# =====================================================================

# Append this script's basename to the $CSA_PGM list. The last element
# of that list is what gets used as the program name in messages.
CSA_PGM = ($CSA_PGM `{basename $0})

# $CSA names the file sourced right below, so an unset (empty-list)
# $CSA is fatal: complain on stderr and stop.
~ $#CSA 0 && {
	echo $CSA_PGM($#CSA_PGM)^:' $CSA not set, cannot continue' >[1=2]
	exit 1
}

# Source the file named by $CSA; give up if that fails.
if (! . $CSA) exit 1

# Even when $CSA exists and parses correctly, it may still not be the
# file this program expects. The first csaPrintMsg() call, which comes
# next, therefore serves two purposes: it reports that the program has
# started, and it makes the program exit if that function is undefined.
# Announce start-up (message 0060, tagged with the program name). If the
# call fails for any reason, exit with the status it left behind.
csaPrintMsg 0060 $CSA_PGM($#CSA_PGM) || exit $status

# =====================================================================
# Local variables and functions
# =====================================================================

# Harmless placeholder until csaMkTemp is called on this variable below.
tmp1 = /dev/null

# =====================================================================
# Main program
# =====================================================================

# The only argument must be an http:// or https:// URL; anything else
# (including a missing argument) is reported as fault 0038.
~ $1 http://* https://* || csaExit.fault 0038

get_url = $1

# NOTE(review): csaMkTemp is defined outside this file; presumably it
# points $tmp1 at a freshly created temporary file -- confirm.
csaMkTemp tmp1

# See pingback specs at http://www.terenzani.it/26/specifiche-pingback
#
# We only ask for the first 20 KBytes of data from the specified source
# URL, both to prevent trivial DoS attacks and to conform to the specs
# (see the referenced documentation link above). We could read 5 KBytes
# or less, but it would fail on typical CSA pages, due to the largish
# initial RDF block, so we need to read a bit more. Better use HTTP/1.0
# or curl(1) may hang this script by keeping the connection open,
# depending on how the remote site works.
#
# Hardening of the fetch:
#
#  --max-time 30   --connect-timeout only bounds the connection phase,
#                  so without an overall limit a server that trickles
#                  data (or never closes the connection) could still
#                  hang this script indefinitely.
#
#  --globoff       the URL comes from the caller; with curl's URL
#                  globbing left on, sequences such as [1-100000] or
#                  {a,b,c} inside it would be expanded into many
#                  separate requests.
#
# NOTE: --range is only a request. A server that ignores the Range
# header will send the whole document, so the 20 KBytes cap relies on
# the remote site's cooperation; --max-time is what ultimately bounds
# the transfer.

curl -0 --silent --include --retry 1 --connect-timeout 10 \
	--max-time 30 --globoff \
	--ignore-content-length --range 0-20000 \
	--output $tmp1 $get_url || csaExit.fault 0054

# Alternative way, using wget(1). Unfortunately if wget(1) is writing
# to a pipe and the latter is closed prematurely, as in this case, it
# returns "1" as opposed to SIGPIPE, making it impossible for us to
# understand whether the non-zero code was due to an unreachable URL or
# to the SIGPIPE, hence the need to act in two steps, as shown.
#
#wget -S -t 1 -T 10 --ignore-length \
#	-O - $get_url >[2=1] | head -c 20000 > $tmp1
#
#test -s $tmp1 || csaExit.fault 0054

# If all goes well, this will print a newline-separated list of
# host,port,path to stdout.
#
# NOTE(review): csaAwkCmd is defined outside this file; presumably it
# leaves the awk command line for pb-discover.awk in $CSA_RESULT, which
# is then run on the downloaded data -- confirm.

csaAwkCmd pb-discover.awk
$CSA_RESULT -- $tmp1 || csaExit.fault 0003 AWK

csaExit.ok	# mandatory !

# End of program.