# =====================================================================
# pingbackServer: W-TW pingback processor.
#
# Copyright (c) 2007,2009 Carlo Strozzi
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; version 2 dated June, 1991.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
# =====================================================================

# =====================================================================
# Note: recent versions of CSA provide all that is needed to handle
# XML-RPC natively. This program does not use such facilities, which
# were not yet available when the program was written, and handles the
# XML-RPC dialog directly. The program should be rewritten eventually.
# =====================================================================

# =====================================================================
# Local variables and functions
# =====================================================================

# re-define csaExit.{ok,fault} for XML-RPC.
# See pingback specs at http://www.terenzani.it/26/specifiche-pingback
#
# See http://xmlrpc-epi.sourceforge.net/specs/rfc.fault_codes.php for
# standard XML-RPC fault codes.
37 38 fn csaExit.ok { 39 40 if (!csaCommit) { 41 xml_err_txt = 'system error' 42 xml_err_num = -32500 43 xml_err_len = 274 # 256 + length(xml_err_txt) + length("-32500") 44 csaExit.fault 45 } 46 47 cat < 77 78 79 80 81 82 faultCode 83 $xml_err_num 84 85 86 faultString 87 $xml_err_txt 88 89 90 91 92 93 EOF 94 csaExit 1 95 } 96 97 cgi.group = () 98 cgi.group.literal = () 99 cgi.page = () 100 cgi.pb.url = () 101 cgi.localurl.stub = () 102 cgi.pb.url.stem = () 103 cgi.grep.pattern = () 104 cgi.pb.hash = () 105 106 get_url = () 107 108 tmp1 = /dev/null 109 tmp2 = /dev/null 110 111 xml_err_txt = 'internal server error' 112 xml_err_num = 0 113 xml_err_len = 257 # length(envelope) + length("0") 114 115 # ===================================================================== 116 # Main program 117 # ===================================================================== 118 119 csaGetArgs PUT # may trigger fault 0071, see above. 120 121 if (~ $'cgi.group' ()) { 122 xml_err_txt = 'requested resource not found' 123 xml_err_num = 33 124 xml_err_len = 286 # 256 + length(xml_err_txt) + length("33") 125 csaExit.fault 126 } 127 128 tw_gstem = $CSA_ROOT/var/pages/$CSA_LANG/$'cgi.group' 129 130 CSA_TPL_ROOT = $CSA_ROOT/forms/$CSA_LANG/$'cgi.group' 131 132 tw_dstem = $CSA_ROOT/lib/default 133 134 # Account for the fact that templates are created by the client. 
CSA_TPL_SAFEPATH = ($CSA_INSTALL/lib $tw_dstem
    $TMPDIR $tw_gstem $CSA_TPL_ROOT
    $CSA_DOCROOT/$CSA_LANG/$TNS_ATTACH_PUBDIR)

CSA_EXIT_SCRIPT = $CSA_ROOT/lib/exit-stuff.rc

# Every fault below sets xml_err_len with the rule used throughout this
# file: 256 + length(xml_err_txt) + length(xml_err_num).

# A page name must have been supplied.
if (~ $'cgi.page' ()) {
    xml_err_txt = 'requested resource not found'
    xml_err_num = 33
    xml_err_len = 286   # 256 + length(xml_err_txt) + length("33")
    csaExit.fault
}

# A source (pinging) URL must have been supplied ...
if (~ $'cgi.pb.url' ()) {
    xml_err_txt = 'bad or missing argument(s)'
    xml_err_num = 33
    xml_err_len = 284   # 256 + length(xml_err_txt) + length("33")
    csaExit.fault
}

# ... and a lone "-" in its place is rejected as empty or malformed.
if (~ $'cgi.pb.url' -) {
    xml_err_txt = 'empty or malformed source URL'
    xml_err_num = 33
    # The message is 29 characters long, so the rule gives 287; the
    # previous value (288) was one more than that.
    xml_err_len = 287   # 256 + length(xml_err_txt) + length("33")
    csaExit.fault
}

tw_pstem = $tw_gstem/$'cgi.page'

# Load group-level configuration settings if available.
csaIsFullPath --exists --quiet $tw_gstem+cf && . $tw_gstem+cf

# Load group meta-data (linear search is ok here). The record for the
# requested group is turned into tbl_group.* assignments and sourced.
csaMkTemp tmp1 tmp2
csa-tbl2rc --input $CSA_ROOT/var/pages/$CSA_LANG/group+dat \
    --key $'cgi.group' --prefix tbl_group. > $tmp1
. $tmp1

# The specified group MUST exist.
~ $'tbl_group.k_group' () && {
    xml_err_txt = 'requested resource not found'
    xml_err_num = 33
    xml_err_len = 286   # 256 + length(xml_err_txt) + length("33")
    csaExit.fault
}

. $CSA_ROOT/lib/group-editor.rc

# Load page meta-data: the page record is turned into tbl_page.*
# assignments and sourced.
keysearch $'cgi.page' $tw_gstem/page+dat |
    csa-tbl2rc --prefix tbl_page. > $tmp1
. $tmp1

# The specified page MUST exist or it may not have pingbacks.
if (~ $'tbl_page.k_page' ()) {
    xml_err_txt = 'requested resource not found'
    xml_err_num = 33
    xml_err_len = 286   # 256 + length(xml_err_txt) + length("33")
    csaExit.fault
}

# The three values below are required further down; if any of them is
# missing the request is answered with a generic system error.

~ $'cgi.localurl.stub' () && {
    xml_err_txt = 'system error'
    xml_err_num = -32500
    xml_err_len = 274   # 256 + length(xml_err_txt) + length("-32500")
    csaExit.fault
}

~ $'cgi.grep.pattern' () && {
    xml_err_txt = 'system error'
    xml_err_num = -32500
    xml_err_len = 274   # 256 + length(xml_err_txt) + length("-32500")
    csaExit.fault
}

~ $'cgi.pb.hash' () && {
    xml_err_txt = 'system error'
    xml_err_num = -32500
    xml_err_len = 274   # 256 + length(xml_err_txt) + length("-32500")
    csaExit.fault
}

# Duplicates are not detected on the literal URL: the same pinging URL
# could then be entered several times with only slight variations.
# Instead the check is made on a fuzzier value, computed from a
# normalized version of the URL. Here that value is replaced by its
# csaSum digest.

cgi.pb.hash = `{echo $'cgi.pb.hash' | csaSum}

# Resolve local URLs (those beginning with a slash).
if (~ $'cgi.pb.url' /*) {
    # URL to fetch: local stub + stem + path.
    get_url = $'cgi.localurl.stub'^$'cgi.pb.url.stem'^$'cgi.pb.url'

    # URL to record: the path with its leading component prepended.
    cgi.pb.url = $'cgi.pb.url.stem'^$'cgi.pb.url'
} else {
    get_url = $'cgi.pb.url'
}

# ACLs are loaded earlier than the actual RPC program or its BEGIN
# section, so if we rely on them to deny access the client will receive
# the standard csaExit.fault response and not the XML one returned by
# the local override of csaExit.fault. This may not be a problem, but I
# prefer to abide by the specs and return what the client is supposed to
# expect if possible. Anyway, I currently allow public pingbacks; if
# this gets abused I'll restrict it.
#
#~ ,$TNS_AUTH_GRP, *,editor,* || {
#    xml_err_txt = 'access denied'
#    xml_err_num = 49
#    xml_err_len = 271   # 256 + length(xml_err_txt) + length("49")
#    csaExit.fault
#}

# To be rigorous, locking should be done now, but since curl(1) has
# yet to be run I do not want it to keep the lock set for too long,
# and since these tests are not so critical anyway I will set the lock
# further down.

# Complain if the specified page body does not exist (should not occur!).
if (!csaIsFullPath --quiet --exists $tw_pstem+wki) {
    xml_err_txt = 'requested resource not found'
    xml_err_num = 33
    xml_err_len = 286   # 256 + length(xml_err_txt) + length("33")
    csaExit.fault
}

# Create target table if it does not yet exist.
if (!csaIsFullPath --exists --quiet $tw_pstem+tbk) {
    maketable --input \
        $CSA_ROOT/lib/trackback.xrf > $tw_pstem+tbk || {
        xml_err_txt = 'system error'
        xml_err_num = -32500
        xml_err_len = 274   # 256 + length(xml_err_txt) + length("-32500")
        csaExit.fault
    }
}

# Trying to enter an already known source URI? The hash is looked up
# as the last, tab-preceded field of a line in the trackback table.
if (grep -qe $tab$'cgi.pb.hash'^'$' $tw_pstem+tbk) {

    # The pingback protocol does not mandate that an error is
    # thrown if the source URI was already entered but I like
    # to try and be as compliant as possible.

    xml_err_txt = 'the specified source URI has already been registered'
    xml_err_num = 48
    # The message is 52 characters long, so the rule used at every other
    # fault site gives 310; the previous value (356) did not follow it.
    xml_err_len = 310   # 256 + length(xml_err_txt) + length("48")
    csaExit.fault
}

# Trivial test to see whether the source URI actually links to the
# specified target URI. Be forgiving by matching case-insensitively.
# Of course this test can be easily fooled, but it is not meant to cope
# with intentional tweaking on the part of the client.
# We only take the first 50 KBytes of data from the specified source
# URI, both to prevent trivial DoS attacks and to conform to the specs
# (see the referenced documentation link above). We could read 5 KBytes
# or less, but it would fail on typical CSA pages, due to the largish
# initial RDF block, so we need to read a bit more. Note that I crush
# all special and URI-encoded characters in the received content, to be
# as forgiving as possible regarding how the target local URL is
# represented within the document at the source URL.

# Better use HTTP/1.0 or curl(1) may hang this script by keeping the
# connection open, depending on how the remote site works.
#
# The URL comes from the remote client, so it is handed to curl through
# --url rather than as a bare operand: that way a value starting with
# a dash cannot be taken for a command-line option.
#
# The raw response (headers included) is kept in $tmp2; $tmp1 receives a
# crushed copy: fragments dropped, every character outside
# [-_.:%a-zA-Z0-9] removed, upper case folded to lower case.
curl -0 --silent --include --retry 1 --connect-timeout 10 \
    --ignore-content-length --range 0-50000 --url $get_url | tee $tmp2 | sed '
    s,#.*,,
    s,[^-_.:%a-zA-Z0-9],,g
    y,QWERTYUIOPASDFGHJKLZXCVBNM,qwertyuiopasdfghjklzxcvbnm,' > $tmp1

csaStatus || {
    xml_err_txt = 'unable to load the specified source URI'
    xml_err_num = 16
    xml_err_len = 297   # 256 + length(xml_err_txt) + length("16")
    csaExit.fault
}

# Alternative way, using wget(1). Unfortunately if wget(1) is writing to a
# pipe and the latter is closed prematurely, as in this case, it returns
# "1" as opposed to SIGPIPE, making it impossible for us to understand
# whether the non-zero code was due to an unreachable URI or to the
# SIGPIPE, hence the need to act in two steps, as shown.
#
#wget -q -t 1 -T 10 --ignore-length -O - $get_url |
#    head -c 50000 | tee $tmp2 | sed '
#    s,#.*,,
#    s,[^-_.:%a-zA-Z0-9],,g
#    y,QWERTYUIOPASDFGHJKLZXCVBNM,qwertyuiopasdfghjklzxcvbnm,' > $tmp1
#
#if (!test -s $tmp1) {
#    xml_err_txt = 'unable to load the specified source URI'
#    xml_err_num = 16
#    xml_err_len = 297   # 256 + length(xml_err_txt) + length("16")
#    csaExit.fault
#}

# The crushed copy of the source document must match the target pattern
# (extended regular expression, case-insensitive).
if (!grep -qiEe $'cgi.grep.pattern' $tmp1) {
    xml_err_txt = 'unable to find references to target URI in source URI'
    xml_err_num = 17
    xml_err_len = 311   # 256 + length(xml_err_txt) + length("17")
    csaExit.fault
}

# Prepare updated values for the page table: one more trackback.
tbl_page.k_page = $'cgi.page'
tbl_page.p_ntbk = `{expr $'tbl_page.p_ntbk' + 1}

# Prepare updated values for the trackback table. With pingbacks only
# a subset of attributes are supported with respect to trackbacks.

tbl_track.k_tbkurl = $'cgi.pb.url'
tbl_track.t_type = P
tbl_track.t_ctime = $CSA_TIME_ISO8601
tbl_track.t_creip = $REMOTE_ADDR
tbl_track.t_hash = $'cgi.pb.hash'

# Try and get the remote page title. Ensure we are not
# loading binary stuff or something nasty.
#
# tr(1) drops every non-printable character (newlines included), so
# sed(1) sees the raw response as a single line. It then:
#   1. removes everything up to and including the opening title tag;
#   2. removes the closing title tag and everything after it (without
#      this pattern the command would erase the whole line and the
#      title would always fall back to the default);
#   3. removes any leftover angle brackets;
#   4. keeps at most the first 30 characters.

tbl_track.t_title = ``(){tr -d -c '[:print:]' < $tmp2 | sed '
    s,.*<[tT][iI][tT][lL][eE]>,,
    s,</[tT][iI][tT][lL][eE]>.*,,
    s,[<>]\+,,g
    s,^\(.\{30\}\).*$,\1,
'}

~ $'tbl_track.t_title' () && tbl_track.t_title = -   # default

# Set Principal Lock Semaphore(s) (PLS).
if (!csaLock $tw_gstem/page+dat) {
    xml_err_txt = 'system error'
    xml_err_num = -32500
    xml_err_len = 274   # 256 + length(xml_err_txt) + length("-32500")
    csaExit.fault
}

# --- Page table ------------------------------------------------------
# The value csaOpen leaves in CSA_RESULT is used as the file that
# receives the updated table.

if (!csaOpen --fast $tw_gstem/page+dat) {
    xml_err_txt = 'system error'
    xml_err_num = -32500
    xml_err_len = 274   # 256 + length(xml_err_txt) + length("-32500")
    csaExit.fault
}

tmp_pages = $CSA_RESULT

# Turn the tbl_page.* variables into a table row, merge it into the
# page table on k_page and write the sorted result.
envtotable --match '^tbl_page__2e[a-z]' --strip-names '^tbl_page__2e' |
    updtable --stdin --key-columns k_page $tw_gstem/page+dat |
    sorttable > $tmp_pages

# csaStatus must run right after the pipeline it reports on.
csaStatus || {
    xml_err_txt = 'system error'
    xml_err_num = -32500
    xml_err_len = 274   # 256 + length(xml_err_txt) + length("-32500")
    csaExit.fault
}

# --- Page trackback meta-data table ----------------------------------

if (!csaOpen --fast --relaxed $tw_pstem+tbk) {
    xml_err_txt = 'system error'
    xml_err_num = -32500
    xml_err_len = 274   # 256 + length(xml_err_txt) + length("-32500")
    csaExit.fault
}

tmp_track = $CSA_RESULT

# Same procedure with the tbl_track.* variables, keyed on k_tbkurl.
envtotable --match '^tbl_track__2e[a-z]' --strip-names '^tbl_track__2e' |
    updtable --stdin --key-columns k_tbkurl $tw_pstem+tbk |
    sorttable > $tmp_track

csaStatus || {
    xml_err_txt = 'system error'
    xml_err_num = -32500
    xml_err_len = 274   # 256 + length(xml_err_txt) + length("-32500")
    csaExit.fault
}

# Extract the columns needed by the static views into $tmp2, sorted
# with "sorttable -r" on creation time.
getcolumn --input $tmp_track k_tbkurl t_ctime t_title t_creip |
    sorttable -r t_ctime > $tmp2

csaStatus || {
    xml_err_txt = 'system error'
    xml_err_num = -32500
    xml_err_len = 274   # 256 + length(xml_err_txt) + length("-32500")
    csaExit.fault
}

# Update the page-tbk+xml static view.
if (!csaOpen --fast --relaxed $tw_pstem-tbk+xml) {
    xml_err_txt = 'system error'
    xml_err_num = -32500
    xml_err_len = 274   # 256 + length(xml_err_txt) + length("-32500")
    csaExit.fault
}

# Remember the value left by csaOpen before csaAwkCmd replaces it.
tmp1 = $CSA_RESULT

# csaAwkCmd leaves the command to run in CSA_RESULT; that command reads
# the trackback listing in $tmp2 and writes the view to $tmp1.
csaAwkCmd pageTrackBacks.awk
$CSA_RESULT < $tmp2 > $tmp1 || {
    xml_err_txt = 'system error'
    xml_err_num = -32500
    xml_err_len = 274   # 256 + length(xml_err_txt) + length("-32500")
    csaExit.fault
}

# Update the page-tbk-mt+xml static view, same procedure as above.
if (!csaOpen --fast --relaxed $tw_pstem-tbk-mt+xml) {
    xml_err_txt = 'system error'
    xml_err_num = -32500
    xml_err_len = 274   # 256 + length(xml_err_txt) + length("-32500")
    csaExit.fault
}

tmp1 = $CSA_RESULT

csaAwkCmd mtTrackBackPings.awk
$CSA_RESULT < $tmp2 > $tmp1 || {
    xml_err_txt = 'system error'
    xml_err_num = -32500
    xml_err_len = 274   # 256 + length(xml_err_txt) + length("-32500")
    csaExit.fault
}

# All done: hand over to the XML-RPC flavour of csaExit.ok.
csaExit.ok

#EOF