# =====================================================================
# trackbackPing: W-TW trackback ping processor.
#
# Copyright (c) 2007,2009 Carlo Strozzi
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; version 2 dated June, 1991.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
# =====================================================================

# See the following URLs for more information on trackbacks:
#
# http://www.terenzani.it/19/come-funzionano-i-trackback/
# http://www.sixapart.com/pronet/docs/trackback_spec

# Overview of what this script does, in order:
#
#   1. validates the POSTed arguments (page, source URL, hash, ...);
#   2. requires a captcha from unauthenticated clients;
#   3. fetches the first ~50 KBytes of the source URL and checks that
#      it mentions the target URL;
#   4. bumps the page trackback counter and records the trackback in
#      the per-page trackback table;
#   5. regenerates the two static XML views of the trackback table.
#
# When not running interactively, the outcome is reported to the client
# as a trackback XML response rather than through the regular CSA
# result pages.

# =====================================================================
# Local variables and functions
# =====================================================================

csaIsInteractive || {

   # re-define csaExit.{ok,fault} if not interactive.

   # csaExit.ok: commit pending changes and send the trackback
   # "success" XML response. Any argument passed by the caller (the
   # redirect URL) is ignored in this mode.

   fn csaExit.ok {

      csaCommit || csaExit.fault

      # NOTE(review): the header lines and the XML markup of this
      # here-document were not recoverable from the damaged source and
      # have been reconstructed from the trackback specification.
      # Content-Length is the byte count of the body below -- confirm
      # against the original file.

      cat <<EOF
Content-Type: text/xml; charset=utf-8
Content-Length: 79

<?xml version="1.0" encoding="utf-8"?>
<response>
<error>0</error>
</response>
EOF
      csaExit 0
   }

   # csaExit.fault: send the trackback "error" XML response.
   #
   # Usage: csaExit.fault [--back] [http_status] [error_code] [detail]
   #
   # '--back' is accepted and ignored. When more than one argument is
   # left after that, the first one (e.g. the '401' in '401 0013') is
   # dropped, so that $1 is the CSA error code. With no arguments the
   # error code defaults to 0066.

   fn csaExit.fault {

      # This check may never be triggered if the URL rewriting map
      # (rest.map) only allows POST for this program. Leaving it
      # in place will not hurt anyway.

      ~ $1 --back && shift		# ignore '--back'

      ~ $#* 0 1 || shift

      ~ $1 () && * = 0066

      # xml_err_len is the byte length of the whole XML body for the
      # given message, i.e. the length of xml_err_txt plus the 99 bytes
      # of fixed markup in the here-document below. Keep the two in
      # sync when changing either.

      switch ($1) {

	 case 0013
	    xml_err_txt = 'authorization failed'
	    xml_err_len = 119

	 case 0054
	    xml_err_txt = 'unable to load the specified source URL'
	    xml_err_len = 138

	 case 0071 1003
	    xml_err_txt = \
 'trackback protocol violation, see http://www.sixapart.com/pronet/docs/trackback_spec'
	    xml_err_len = 183

	 case 0074
	    xml_err_txt = 'requested object not found'
	    xml_err_len = 125

	 case 1000 1001
	    xml_err_txt = 'requested resource not found'
	    xml_err_len = 127

	 case 1004
	    xml_err_txt = 'unacceptable URL'
	    xml_err_len = 115

	 case 1005
	    xml_err_txt = 'the specified URL has already been registered'
	    xml_err_len = 144

	 case 1006
	    xml_err_txt = \
	       'unable to find references to target URL in source URL'
	    xml_err_len = 152

	 case *
	    xml_err_txt = 'internal trackback server error'
	    xml_err_len = 130
      }

      # NOTE(review): the header lines of this here-document were not
      # recoverable from the damaged source and have been reconstructed.
      # The XML body matches the xml_err_len values above byte for byte.

      cat <<EOF
Content-Type: text/xml; charset=utf-8
Content-Length: $xml_err_len

<?xml version="1.0" encoding="utf-8"?>
<response>
<error>1</error>
<message>$xml_err_txt</message>
</response>
EOF
      csaExit 1
   }
}

cgi.group = ()
cgi.group.literal = ()
cgi.page = ()
cgi.tb.url = ()
cgi.tb.url.stem = ()
cgi.grep.pattern = ()
cgi.tb.name = ()
cgi.tb.hash = ()
cgi.tb.excerpt = ()
cgi.tb.captcha = ()

cgi.localurl.stub = ()

cgi.tb.title = -

tmp2 = /dev/null

xml_err_txt = 'internal server error'
xml_err_len = 120

get_url = ()

# =====================================================================
# Main program
# =====================================================================

csaGetArgs POST

#~ $REMOTE_ADDR 192.168.1.2 && csaExit.env

. $CSA_ROOT/lib/group-stuff.rc

# Check the most important arg.
~ $'cgi.page' () && csaExit.fault 1001

tw_pstem = $tw_gstem/$'cgi.page'

CSA_EXIT_SCRIPT = ($CSA_EXIT_SCRIPT back)

~ $'cgi.tb.url' () && csaExit.fault --back 1003
~ $'cgi.tb.url' - && csaExit.fault --back 1004
~ $'cgi.localurl.stub' () && csaExit.fault 0066	# should not occur.
~ $'cgi.grep.pattern' () && csaExit.fault 0066	# should not occur.
~ $'cgi.tb.hash' () && csaExit.fault 0066	# should not occur.

~ $'cgi.tb.title' - && cgi.tb.title = $'cgi.tb.name'	# default

# To minimize the possibility that the same pinging URL be entered
# multiple times with only slight variations, I prefer to check for
# duplicates not on the literal URL but rather on a fuzzier value
# computed from a normalized version of the URL itself.

cgi.tb.hash = `{echo $'cgi.tb.hash' | csaSum}

# Restore default behavior.
csaChop $CSA_EXIT_SCRIPT; CSA_EXIT_SCRIPT = $CSA_RESULT

. $CSA_ROOT/lib/group-editor.rc

# Load page meta-data.
keysearch $'cgi.page' $tw_gstem/page+dat |
   csa-tbl2rc --prefix tbl_page. > $tmp1
. $tmp1

# The specified page MUST exist or it may not have trackbacks.
~ $'tbl_page.k_page' () && csaExit.fault --back 1001

# Set template vars to their final values.
tpl.var.tw.page = $'tbl_page.p_name'
tpl.var.tw.page.object = $'tpl.var.tw.page'
tpl.var.html.title = $'tpl.var.tw.group'/$'tpl.var.tw.page'

# ACLs are loaded earlier than the actual RPC program or its BEGIN
# section, so if we rely on them to deny access the client will receive
# the standard csaExit.fault response and not the XML one returned by
# the local override of csaExit.fault. This may not be a problem, but I
# prefer to abide by the specs and return what the client is supposed to
# expect if possible. Anyway, I currently allow public trackbacks, if
# this will be abused I'll restrict it.
#
#~ ,$TNS_AUTH_GRP, *,editor,* || csaExit.fault 401 0013

# Given the above considerations, as a minimal antispam feature I require
# that a captcha code be entered by unauthenticated users. If this
# is an interactive trackback submission then the required captcha
# code will have been provided to the user in the relevant entry form,
# otherwise it will have to have been provided to automated clients
# beforehand. Because of this, unattended trackback pinging is currently
# not possible.

# As a minimal antispam feature I require that a captcha code be entered
# by unauthenticated users.

if (!csaTrue $CSA_AUTH_OK) {
   if (~ $'cgi.tb.captcha' () || !~ $'cgi.tb.captcha' $CSA_SESSION(10)) {

      # Note: whether the "--back" button will actually be effective
      # depends on how 'csaExit.fault' is defined at this stage, which
      # in turn depends on whether we are running in interactive mode
      # or not.
      csaExit.fault --back 401 0013
   }
}

# Resolve local URLs.

if (~ $'cgi.tb.url' /*) {
   get_url = $'cgi.localurl.stub'$'cgi.tb.url.stem'$'cgi.tb.url'

   # Prepend leading component to local URLs.
   cgi.tb.url = $'cgi.tb.url.stem'^$'cgi.tb.url'

} else get_url = $'cgi.tb.url'

# To be rigorous, locking should be done now, but since curl(1) has
# yet to be run I do not want it to keep the lock set for too long,
# and since these tests are not so critical anyway I will set the lock
# further down.

# Complain if the specified page body does not exist (should not occur!).
csaIsFullPath --exists --quiet $tw_pstem+wki || csaExit.fault 0074

# Create target table if it does not yet exist.
if (!csaIsFullPath --exists --quiet $tw_pstem+tbk) {
   maketable --input \
      $CSA_ROOT/lib/trackback.xrf > $tw_pstem+tbk ||
	 csaExit.fault 0003 maketable
}

# Trying to enter an already known referring URI ?
grep -qe $tab$'cgi.tb.hash'^'$' $tw_pstem+tbk && csaExit.fault --back 1005

# Trivial test to see whether the source URL actually links to the
# specified target URL. Be forgiving by matching case-insensitively.
# Of course this test can be easily fooled, but it is not meant to cope
# with intentional tweaking on the part of the client. We only take the
# first 50 KBytes of data from the specified source URL, both to prevent
# trivial DoS attacks and to conform to the specs (see the referenced
# documentation link above). We could read 5 KBytes or less, but it
# would fail on typical CSA pages, due to the largish initial RDF block,
# so we need to read a bit more. Note that I crush all special and
# URI-encoded characters in the received content, to be as forgiving as
# possible regarding how the target local URL is represented within the
# document at the source URL.

csaMkTemp tmp2

# Better use HTTP/1.0 or curl(1) may hang this script by keeping the
# connection open, depending on how the remote site works.
# The raw response (headers included) is kept in $tmp2 for the title
# extraction further down; the crushed, lower-cased copy goes to $tmp1.
curl -0 --silent --include --retry 1 --connect-timeout 10 \
   --ignore-content-length --range 0-50000 $get_url | tee $tmp2 | sed '
   s,#.*,,
   s,[^-_.:%a-zA-Z0-9],,g
   y,QWERTYUIOPASDFGHJKLZXCVBNM,qwertyuiopasdfghjklzxcvbnm,' > $tmp1

csaStatus || csaExit.fault 0054

# Alternative way, using wget(1). Unfortunately if wget(1) is writing
# to a pipe and the latter is closed prematurely, as in this case, it
# returns "1" as opposed to SIGPIPE, making it impossible for us to
# understand whether the non-zero code was due to an unreachable URL or
# to the SIGPIPE, hence the need to act in two steps, as shown.
#
#wget -q -t 1 -T 10 --ignore-length -O - $get_url |
#   head -c 50000 | tee $tmp2 | sed '
#   s,#.*,,
#   s,[^-_.:%a-zA-Z0-9],,g
#   y,QWERTYUIOPASDFGHJKLZXCVBNM,qwertyuiopasdfghjklzxcvbnm,' > $tmp1
#
#test -s $tmp1 || csaExit.fault 0054

grep -qiEe $'cgi.grep.pattern'(1) $tmp1 || csaExit.fault --back 1006

if (~ $'cgi.tb.title' ()) {

   # Try and get the remote page title if not entered explicitly by the
   # user. Ensure we are not loading binary stuff or something nasty.
   #
   # tr(1) also removes newlines, so sed(1) sees the whole document as
   # one line: drop everything up to and including the opening <title>
   # tag, then everything from the closing tag onwards, then any stray
   # angle brackets, and finally truncate the result to 30 characters.

   cgi.tb.title = ``(){tr -d -c '[:print:]' < $tmp2 | sed '
      s,.*<[tT][iI][tT][lL][eE]>,,
      s,</[tT][iI][tT][lL][eE]>.*,,
      s,[<>]\+,,g
      s,^\(.\{30\}\).*$,\1,
   '}
}

# Prepare updated values for the page table.
tbl_page.k_page = $'cgi.page'
tbl_page.p_ntbk = `{expr $'tbl_page.p_ntbk' + 1}

# Prepare updated values for the trackback table.
tbl_track.k_tbkurl = $'cgi.tb.url'
tbl_track.t_type = T
tbl_track.t_ctime = $CSA_TIME_ISO8601
tbl_track.t_creip = $REMOTE_ADDR
tbl_track.t_title = $'cgi.tb.title'

# 'cgi.tb.blog_name' is never declared nor reset among the local
# variables above, whereas 'cgi.tb.name' is (and is already used as the
# default title). Honour the former if something sets it, for backward
# compatibility, and fall back to the latter otherwise.
# NOTE(review): presumably 'cgi.tb.name' carries the pinging blog name;
# confirm against the argument definitions for this program.
tbl_track.t_blog = $'cgi.tb.blog_name'
~ $'tbl_track.t_blog' () && tbl_track.t_blog = $'cgi.tb.name'

tbl_track.t_descr = $'cgi.tb.excerpt'
tbl_track.t_hash = $'cgi.tb.hash'

# Set Principal Lock Semaphore(s) (PLS).
csaLock $tw_gstem/page+dat || csaExit.fault

# Update the page meta-data table.
csaOpen --fast $tw_gstem/page+dat || csaExit.fault
tmp_pages = $CSA_RESULT

envtotable --match '^tbl_page__2e[a-z]' --strip-names '^tbl_page__2e' |
   updtable --stdin --key-columns k_page $tw_gstem/page+dat |
      sorttable > $tmp_pages

csaStatus || csaExit.fault 0003 envtotable/updtable/sorttable

# Update the page trackback meta-data table.
csaOpen --fast --relaxed $tw_pstem+tbk || csaExit.fault
tmp_track = $CSA_RESULT

envtotable --match '^tbl_track__2e[a-z]' --strip-names '^tbl_track__2e' |
   updtable --stdin --key-columns k_tbkurl $tw_pstem+tbk |
      sorttable > $tmp_track

csaStatus || csaExit.fault 0003 envtotable/updtable/sorttable

# Update the page-tbk+xml static view.
# From here on $tmp2 holds the trackback listing (newest first) that
# feeds both static views, and $tmp1 is re-pointed at each view's
# work file in turn.
csaOpen --fast --relaxed $tw_pstem-tbk+xml || csaExit.fault
tmp1 = $CSA_RESULT
getcolumn --input $tmp_track k_tbkurl t_ctime t_title t_creip |
   sorttable -r t_ctime > $tmp2
csaStatus || csaExit.fault 0003 getcolumn/sorttable

csaAwkCmd pageTrackBacks.awk
$CSA_RESULT < $tmp2 > $tmp1 || csaExit.fault 0003 AWK:pageTrackBacks

# Update the page-tbk-mt+xml static view.
csaOpen --fast --relaxed $tw_pstem-tbk-mt+xml || csaExit.fault
tmp1 = $CSA_RESULT

csaAwkCmd mtTrackBackPings.awk
$CSA_RESULT < $tmp2 > $tmp1 || csaExit.fault 0003 AWK:mtTrackBackPings

# Take the client back to the updated page.

csaExit.ok $CSA_RPC_URI/$CSA_LANG/$'tbl_group.g_uri'/$'tbl_page.p_uri'

#EOF