1 # ===================================================================== 2 # _rdfacpi.awk: replace CPI-encoded RDFa markup in string. 3 # 4 # Copyright (c) 2009 Carlo Strozzi 5 # 6 # This program is free software; you can redistribute it and/or modify 7 # it under the terms of the GNU General Public License as published by 8 # the Free Software Foundation; version 2 dated June, 1991. 9 # 10 # This program is distributed in the hope that it will be useful, 11 # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 # GNU General Public License for more details. 14 # 15 # You should have received a copy of the GNU General Public License 16 # along with this program; if not, write to the Free Software 17 # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 18 # 19 # ===================================================================== 20 # string _rdfacpi(string s) 21 # 22 # A number of considerations are in order regarding this function: 23 # 24 # Only line-level CPIs are used, i.e. no block-level (::x:)...(:x::) . 25 # 26 # Each supported type is handled explicitly. Top level elements 27 # ("organization", "person", etc.) will determine how lower level 28 # elements will be handled. That is, an "address" element at top 29 # level will be handled differently from an "address" element found 30 # at a lower level. 31 # 32 # Currently Google recognizes also the following structures, which this 33 # function tries to support: 34 # 35 # 1) "organization" data embedded in "review". 36 # 37 # 2) "person" data embedded in "review". 38 # 39 # 3) "product" data embedded in "review". 40 # 41 # 4) "address" data embedded in either "organization" or "person". 42 # 43 # Up to two levels of nesting are supported, such as "address" inside 44 # an "organization" inside a "review". 
More can be supported in the 45 # future simply by using a multi-valued integer for "subrel" and more 46 # than two work arrays for _mrwresponse() (currently "m" and "m1", see 47 # the code below). 48 # 49 # This function implements RDFa as proposed by Google, which is not 50 # exactly what the actual standard mandates, especially concerning 51 # the use of common and well-understood vocabularies. Please see W3C's 52 # reference docs at http://www.w3.org/MarkUp/2009/rdfa-for-html-authors . 53 # 54 # This local implementation supports also "geo" data, embedded into 55 # either "organization" or "person", because it is considered as an 56 # alternate form of "postal" address here. Note that although it may 57 # seem tempting to allow geo data also as a child element of "address", 58 # this is probably not to be done because, as I said, geographical 59 # coordinates should rather be considered an *alternate* form of 60 # address, not a property of an ordinary postal address as they can 61 # exist independently of the latter. Note also that geo-tagging is NOT 62 # supported on "review" entries, although they may be about places to 63 # visit etc. This limitation can be overcome by resorting to inline 64 # geo-tagging through Microformat's elements. See _mfmtcpi.awk 65 # for more info, as well as further down inside this source code. 66 # 67 # Finally, this program is complicated, so do expect bugs. 68 # 69 # ===================================================================== 70 # 71 # Sample input: 72 # 73 # Sample misplaced entry; since it belongs into the "address" context, 74 # which is not open yet, it is left unparsed: 75 # 76 # (:v-region WA:) 77 # 78 # Open an "organization" context: 79 # 80 # (:v-organization ACME Example 1 Inc.^45.123;9.456:) 81 # (:v-telephone +1 (0)234 123456:) 82 # 83 # In the previous example note the presence of "^45.123;9.456" 84 # immediately following "ACME Example 1 Inc.". 
The caret "^" stands for 85 # "Abbreviation" and can be used to define an abbreviation for the 86 # associated name. The exact type of abbreviation depends on the pattern 87 # of characters that follow the caret itself. Currently only embedded 88 # geo-tagging is supported, but more abbreviation types may be added 89 # in the future. Only one single abbreviation is taken into account, 90 # i.e. if a pattern such as "Text^stuff1^stuff2" is specified, only 91 # "Text^stuff1" will be considered. This seems reasonable since having 92 # multiple abbreviations for the same piece of text seems useless. 93 # Since RDFa has no provisions for abbreviations, I use the same 94 # syntax produced by the Microformat scanner. See the source code of 95 # the latter for more info on this. 96 # 97 # Sample extra entry that does not parse as RDFa 98 # and is left unparsed: 99 # 100 # (:vvvv aaa bbb:) 101 # 102 # Other stuff belonging into the current "organization" context: 103 # 104 # (:v-url http://www1.example.com:) 105 # 106 # Postal Address; since we have an "organization" context already open, 107 # this is treated as that organization's address: 108 # 109 # (:v-address Some Street 1, 123:) 110 # (:v-locality Some Town 1:) 111 # 112 # Entering the same element twice (see previous line) is probably incorrect 113 # but the parser currently accepts it: 114 # 115 # (:v-locality Some Other Town 2:) 116 # 117 # Sample mistyped data (trailing ":" after "region") that will 118 # be left unparsed: 119 # 120 # (:v-region: NJ:) 121 # 122 # Other components of the current "address" context follow: 123 # 124 # (:v-zip 54321:) 125 # (:v-country USA:) 126 # 127 # Organization after an address: 128 # 129 # (:v-organization ACME Example 2 Inc.:) 130 # (:v-telephone +2 (0)234 123456:) 131 # 132 # Sample "product" entry; this is meaningless inside the current "address" 133 # context, so it is sent one level above first, which is "organization", 134 # and since it is invalid there too, a new 
top-level "product" context 135 # is started: 136 # 137 # (:v-product Gizmo:) 138 # (:v-cat Gadgets:) 139 # (:v-desc A Nifty Useless Tingie:) 140 # (:v-price 123.4:) 141 # (:v-photo http://www.example.com/photos/p25.jpg:) 142 # (:v-url http://www2.example.com/products/p25:) 143 # 144 # Sample "review" entry; this is meaningless inside the previous "product" 145 # context, so a new top-level "review" context is started (reviews are 146 # toplevel-only elements): 147 # 148 # (:v-review Komala Vilas 1:) 149 # (:v-by Meenakshi Ammal:) 150 # (:v-rating 1.1:) 151 # (:v-date 1st April 2005:) 152 # (:v-summary Best vegetarian food 1:) 153 # 154 # Sample "review" entry, with optional URI/URL/CURIE that will be used to 155 # build the RDF "about" property of the entry (this is currently 156 # supported by Google only for Reviews and Review-aggregates): 157 # 158 # (:v-review http://www.example.com Komala Vilas 2:) 159 # (:v-by Meenakshi Ammal:) 160 # (:v-rating 2.2:) 161 # (:v-date 2nd April 2005:) 162 # (:v-summary Best vegetarian food 2:) 163 # 164 # Sample "review" entry, with optional blank-node CURIE: 165 # 166 # (:v-review [_:KomalaResort] Komala Vilas 3:) 167 # (:v-by Meenakshi Ammal:) 168 # (:v-rating 3.3:) 169 # (:v-date 3rd April 2005:) 170 # (:v-summary Best vegetarian food 3:) 171 # 172 # Sample "review aggregate" (this one too supports optional URI/URL/CURIE): 173 # 174 # (:v-areview Komala Vilas 4:) 175 # (:v-rating 3.7:) 176 # (:v-count 20:) 177 # (:v-summary Best vegetarian food 4:) 178 # 179 # Sample "review" entry, with the reviewer being an organization instead 180 # of simple #CDATA with the name of the author; note also the geographical 181 # coordinates: 182 # 183 # (:v-review Komala Vilas 5:) 184 # (:v-organization ACME Example 3 Inc.:) 185 # (:v-telephone +3 (0)234 123456:) 186 # (:v-geo:) 187 # (:v-latitude 45.123:) 188 # (:v-longitude 9.456:) 189 # (:v-address Some Street 2, 123:) 190 # (:v-locality Some Town 3:) 191 # (:v-rating 3.8:) 192 # 
(:v-date 4th April 2005:) 193 # (:v-summary Best vegetarian food 5:) 194 # 195 # Sample "review", again by an organization, where "v-telephone" has 196 # been placed after the last element of the "address" context ("v-locality"): 197 # 198 # (:v-review Komala Vilas 6:) 199 # (:v-organization ACME Example 4 Inc.:) 200 # (:v-address Some Street 3, 123:) 201 # (:v-locality Some Town 4:) 202 # (:v-telephone +4 (0)234 123456:) 203 # (:v-rating 3.9:) 204 # (:v-date 5th April 2005:) 205 # (:v-summary Best vegetarian food 6:) 206 # 207 # When the parser meets "v-telephone" in the "address" context it 208 # will bail-out from that context and move up one level, which is 209 # the "organization" context. Since "v-telephone" is valid in that 210 # context, things will parse normally. 211 # 212 # In the following example, "v-telephone" is misplaced after "v-rating": 213 # 214 # (:v-review Komala Vilas 7:) 215 # (:v-organization ACME Example 5 Inc.:) 216 # (:v-address Some Street 4, 123:) 217 # (:v-locality Some Town 5:) 218 # (:v-rating 4.0:) 219 # (:v-telephone +5 (0)234 123456:) 220 # (:v-date 6th April 2005:) 221 # (:v-summary Best vegetarian food 7:) 222 # 223 # When the parser meets "v-rating" in the "address" context it bails-out 224 # to the parent "organization" context, and since "v-rating" is not valid 225 # in that context either, it bails-out further to the "review" context, 226 # where "v-rating" is valid. Then, while in the "review" context, the parser 227 # meets "v-telephone", which is invalid in that context, and the bail-out 228 # process starts again, with the parser climbing backwards until a valid 229 # element/context combination is met. Up to then, all elements encountered 230 # along the way will be left unparsed. Since no elements in the example 231 # belong into any of the upper contexts, they will all be left unparsed 232 # up to the end of the "review" context. 
233 # 234 # Entering two adjacent address blocks is probably incorrect, 235 # but the parser currently accepts them: 236 # 237 # (:v-review Komala Vilas 8:) 238 # (:v-organization ACME Example 6 Inc.:) 239 # (:v-address Some Street 5, 123:) 240 # (:v-locality Some Town 6:) 241 # (:v-address Some Other Street 6, 123:) 242 # (:v-locality Some Other Town 7:) 243 # 244 # In the next example a review done by an organization is followed by 245 # a "person" block. Since reviews can take also persons as reviewers, 246 # a "person" occurring while in a "review" context is considered a 247 # reviewer. In this example therefore we will end up with having 248 # two reviewers for the same review, that is an organization and a 249 # person: 250 # 251 # (:v-review Komala Vilas 9^45.123;9.456:) 252 # (:v-organization ACME Example 7 Inc.:) 253 # (:v-telephone +6 (0)234 123456:) 254 # (:v-address Some Street 7, 123:) 255 # (:v-locality Some Town 8:) 256 # (:v-rating 4.1:) 257 # (:v-date 7th April 2005:) 258 # (:v-summary Best vegetarian food 8:) 259 # 260 # (:v-person John Smith Jr. 1:) 261 # (:v-role General Manager 1:) 262 # 263 # Having multiple reviewers in the same "review" context may be wrong for 264 # Google's crawler, but the parser currently accepts it. If considering 265 # "John Smith Jr." a second reviewer of "Komala Vilas" is not what we 266 # mean, we must ensure to tell the parser by inserting a "v-break" element 267 # between the end of the review and the beginning of the person: 268 # 269 # (:v-review Komala Vilas 10:) 270 # (:v-organization ACME Example 8 Inc.:) 271 # (:v-telephone +7 (0)234 123456:) 272 # (:v-address Some Street 8, 123:) 273 # (:v-locality Some Town 9:) 274 # (:v-rating 4.2:) 275 # (:v-date 8th April 2005:) 276 # (:v-summary Best vegetarian food 9:) 277 # 278 # (:v-break:) 279 # 280 # (:v-person John Smith Jr. 
2:) 281 # (:v-role General Manager 2:) 282 # 283 # 284 # Sample "person" entry, with an embedded address block: 285 # 286 # (:v-person John Smith Jr. 3:) 287 # (:v-url http://www.example.org/~jsmith:) 288 # (:v-photo http://www.example.org/~jsmith/photo.jpg:) 289 # (:v-address Some Street 9, 892:) 290 # (:v-locality Some Town 10:) 291 # (:v-region NY:) 292 # (:v-zip 56671:) 293 # (:v-country USA:) 294 # 295 # Sample XFN properties of the current "person" context: 296 # (:v-xfn friend Brad 1:) 297 # 298 # Same as above, but with multi-valued 'rel' attribute, which is supported 299 # by the XFN specs but not by Google, so I omit the "v:" namespace prefix 300 # on output: 301 # (:v-xfn friend,met Brad 2:) 302 # 303 # Another XFN property of the current "person", in a different format: 304 # (:v-xfn acquaintance http://brad-log3.example.org Brad3:) 305 # 306 # Same as above, but with multi-valued 'rel' attribute, which is supported 307 # by the XFN specs but not by Google, so I omit the "v:" namespace prefix 308 # on output: 309 # (:v-xfn co-worker,met http://brad-log4.example.org Brad4:) 310 # 311 # (:v-xfn me Carlo:) 312 # 313 # In the above examples all tags have been written on a separate line 314 # and without any interspersed text. That was done for readability, but 315 # the whole purpose of the RDFa markup is the semantic labelling of 316 # free-form text, so in practical situations a "person" definition 317 # will be surrounded by, and interspersed with, text, like this: 318 # 319 # Lorem ipsum dolor sit amet (:v-person John Smith Jr. 4:), consectetur 320 # adipiscing elit (:v-role General Manager 3:). Aenean convallis 321 # scelerisque metus, ... 322 # 323 # Following is the parsed RDFa output corresponding to the above input 324 # with the interspersed running text removed for clarity: 325 # 326 #

328 # 329 # ACME Example 1 330 # Inc. 331 # 332 # +1 (0)234 123456 333 # 334 # http://www1.example.com 335 # 336 # 337 # Some Street 1, 123 338 # Some Town 1 339 # Some Other Town 2 340 # 54321 341 # USA 342 # 343 #

344 #

346 # ACME Example 2 Inc. 347 # +2 (0)234 123456 348 #

349 #

350 # Gizmo 351 # Gadgets 352 # A Nifty Useless Tingie 353 # 123.40 354 # 355 # 356 # http://www.example.com/photos/p25.jpg 357 # 358 # 359 # 360 # http://www2.example.com/products/p25 361 # 362 #

363 #

364 # Komala Vilas 1 365 # Meenakshi Ammal 366 # 1.1 367 # 1st April 2005 368 # Best vegetarian food 1 369 #

370 #

371 # 372 # Komala 373 # Vilas 2 374 # Meenakshi Ammal 375 # 2.2 376 # 2nd April 2005 377 # Best vegetarian food 2 378 # 379 #

380 #

381 # 382 # Komala Vilas 3 384 # Meenakshi Ammal 385 # 3.3 386 # 3rd April 2005 387 # Best vegetarian food 3 388 # 389 #

390 #

392 # Komala Vilas 4 393 # 394 # 3.7 395 # 396 # 20 397 # Best vegetarian food 4 398 #

399 #

400 # Komala Vilas 5 401 # 402 # 403 # ACME Example 3 Inc. 404 # +3 (0)234 123456 405 # 407 # 45.123 408 # 9.456 409 # 410 # 411 # Some Street 2, 123 412 # Some Town 3 413 # 414 # 415 # 416 # 3.8 417 # 4th April 2005 418 # Best vegetarian food 5 419 #

420 #

421 # Komala Vilas 6 422 # 423 # 424 # ACME Example 4 Inc. 425 # 426 # Some Street 3, 123 427 # Some Town 4 428 # 429 # +4 (0)234 123456 430 # 431 # 432 # 3.9 433 # 5th April 2005 434 # Best vegetarian food 6 435 #

436 #

437 # Komala Vilas 7 438 # 439 # 440 # ACME Example 5 Inc. 441 # 442 # Some Street 4, 123 443 # Some Town 5 444 # 445 # 446 # 447 # 4.0(:v-telephone +5 (0)234 448 # 123456:)(:v-date 6th April 2005:)(:v-summary Best vegetarian food 449 # 7:)

450 #

451 # Komala Vilas 8 452 # 453 # 454 # ACME Example 6 Inc. 455 # 456 # Some Street 5, 123 457 # Some Town 6 458 # 459 # 460 # Some Other Street 6, 461 # 123 462 # Some Other Town 7 463 # 464 # 465 # 466 #

467 #

468 # 469 # Komala Vilas 9 470 # 471 # 472 # 473 # ACME Example 7 Inc. 474 # +6 (0)234 123456 475 # 476 # Some Street 7, 123 477 # Some Town 8 478 # 479 # 480 # 481 # 4.1 482 # 7th April 2005 483 # Best vegetarian food 8 484 # 485 # 486 # John Smith Jr. 1 487 # General Manager 1 488 # 489 # 490 #

491 #

492 # Komala Vilas 10 493 # 494 # 495 # ACME Example 8 Inc. 496 # +7 (0)234 123456 497 # 498 # Some Street 8, 123 499 # Some Town 9 500 # 501 # 502 # 503 # 4.2 504 # 8th April 2005 505 # Best vegetarian food 9 506 #

507 #

508 # John Smith Jr. 2 509 # General Manager 2 510 #

511 #

512 # John Smith Jr. 3 513 # 514 # 515 # http://www.example.org/~jsmith 516 # 517 # 518 # 519 # http://www.example.org/~jsmith/photo.jpg 520 # 521 # 522 # Some Street 9, 892 523 # Some Town 10 524 # NY 525 # 56671 526 # USA 527 # 528 # Brad 1 529 # Brad 530 # 2 531 # 532 # Brad3 533 # 534 # Brad4 535 # Carlo 536 #

537 #

538 # John Smith Jr. 4 539 # General Manager 3 540 #

541 # 542 # ===================================================================== 543 # 544 # See also the following additional sources: 545 # 546 # * The source code of the _mrwresponse() CSA function. 547 # * http://en.wikipedia.org/wiki/RDFa 548 # * http://linuxgazette.net/105/misc/oregan/foaf-example.rdf.txt 549 # * http://www.google.com/support/webmasters/bin/answer.py?answer=146898 550 # * http://en.wikipedia.org/wiki/CURIE 551 # * http://www.w3.org/MarkUp/2009/rdfa-for-html-authors (important!) 552 # * http://gmpg.org/xfn/intro 553 # * http://gmpg.org/xfn/background (this is *key* to understanding XFN) 554 # * http://www.gmpg.org/xfn/join 555 # * http://rubhub.com/main/add 556 # * http://www.w3.org/2003/01/geo/ 557 # 558 # This link shows how to easily get long/lat from a Google Map search: 559 # http://lifehacker.com/267361/how-to-find-latitude-and-longitude 560 # 561 # The W3C validation service at http://validator.w3.org/check can be 562 # used to validate RDFa documents, but it requires the document itself 563 # to be strictly conformant with DOCTYPE XHTML+RDFa 1.0 , i.e. it is 564 # expected to begin with: 565 # 566 # 567 # <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML+RDFa 1.0//EN" "http://www.w3.org/MarkUp/DTD/xhtml-rdfa-1.dtd"> 568 # 569 # This strict conformancy may be problematic to attain, and it can be 570 # overkill if all we need is to test our embedded RDFa statements. 571 # A more "quick-and-dirty" way to see if our RDFa code is OK is to use 572 # W3C's RDFa Distiller Service at http://www.w3.org/2007/08/pyRdfa/ . 573 # 574 # Where appropriate, this function supports "about" properties in the 575 # form of URLs, URIs, CURIEs and blank-node (bnode) CURIEs. 576 # ===================================================================== 577 578 function _rdfacpi(s, a,b,c,scantype,rel,name,value,uri,tag,S,\ 579 re,oldtype,i,j,saved,flag,m,m1,subrel,div) { 580 581 # Get local settings and apply defaults. 
582 if ((div=_rcget("TNS_GROUP_MISC_PROP",3)) != "span") div = "div" 583 584 delete a; delete b; delete c; delete m; delete m1 585 586 # Pupulate array of valid tags. Note the presence of the "geo" 587 # tag, currently not part of Google's specs and accepted only for 588 # "organization" and "review", as organizations usually stand at a 589 # fixed geographical location, and reviews may be about a place to 590 # visit, etc. See also http://www.w3.org/2003/01/geo/ . 591 592 tag["addr"] = 1 593 tag["address"] = 1 594 tag["affiliation"] = 1 595 tag["areview"] = 1 596 tag["br"] = 1 597 tag["brand"] = 1 598 tag["break"] = 1 599 tag["by"] = 1 600 tag["cat"] = 1 601 tag["category"] = 1 602 tag["count"] = 1 603 tag["country"] = 1 604 tag["date"] = 1 605 tag["desc"] = 1 606 tag["description"] = 1 607 tag["geo"] = 1 608 tag["lat"] = 1 609 tag["latitude"] = 1 610 tag["loc"] = 1 611 tag["locality"] = 1 612 tag["long"] = 1 613 tag["longitude"] = 1 614 tag["nick"] = 1 615 tag["nickname"] = 1 616 tag["org"] = 1 617 tag["organization"] = 1 618 tag["person"] = 1 619 tag["photo"] = 1 620 tag["price"] = 1 621 tag["prod"] = 1 622 tag["product"] = 1 623 tag["rating"] = 1 624 tag["region"] = 1 625 tag["review"] = 1 626 tag["reviewer"] = 1 627 tag["role"] = 1 628 tag["summary"] = 1 629 tag["tel"] = 1 630 tag["telephone"] = 1 631 tag["title"] = 1 632 tag["url"] = 1 633 tag["xfn"] = 1 634 tag["zip"] = 1 635 636 gsub(/:\)/,"\001",s) 637 638 while (_extract(s,"\\(:v-[^\\001]+\\001")) { 639 640 # Better comment this out while debugging. 641 S = S _RLMATCH 642 643 saved = _RMATCH 644 645 sub(/^\(:v-/,"",_RMATCH) 646 sub(/\001/,"",_RMATCH) 647 name = value = _RMATCH 648 name = tolower(name) 649 sub(/ .*/,"",name) 650 651 if (!tag[name]) { 652 # Better comment this out while debugging. 653 S = S "(:v-" _RMATCH "\001" 654 continue 655 } 656 657 if (!sub(/^[^ ]+ +/,"",value)) value = _NULL 658 659 # Handle each supported type explicitly. 660 661 # review and review-aggregate. 
662 if (name ~ /^(a?review|br(eak)?)$/) { 663 664 # Try and detect optional URI in entity name. This is currently 665 # of (possibly future) use only with "review" types and person's 666 # affiliation, as per the relevant Google docs. 667 668 uri = value 669 if (sub(/ .*/,"",uri) && _isuri(uri) == _TRUE) 670 sub(/^[^ ]+ +/,"",value) 671 else uri = "" 672 673 # Like any toplevel-only entries, [a]reviews always bring us 674 # back to top level. If by mistake they occur at a lower level 675 # the remaining entries of such lower level(s) will either be 676 # printed verbatim or they will become part of other aggregates 677 # if compatible. While reviews accept nested "Person" or 678 # "Organization" as the reviewer, review-aggregates do not, 679 # as it would not make sense because each review is (or should 680 # be) from a different source. 681 682 if (subrel) S = S _mrwresponse(m1,".","","","","","","",a,b,1,"!") 683 684 S = S _mrwresponse(m,".","","","","","","",a,b,1,"!") 685 686 scantype = name 687 delete oldtype 688 i = j = rel = 0 689 690 # Handle explicit break requests. 691 if (name ~ /^br/) { 692 scantype = "" 693 continue 694 } 695 696 oldtype[++i] = scantype 697 } 698 699 # organization, possibly within review. 700 else if (name ~ /^org(anization)?$/ && \ 701 (scantype == "" || scantype == "review")) { 702 scantype == "" ? rel = 0 : rel = 1 703 oldtype[++i] = scantype 704 scantype = "org" 705 } 706 707 # product, possibly within review. 708 else if (name ~ /^prod(uct)?$/ && \ 709 (scantype == "" || scantype == "review")) { 710 scantype == "" ? rel = 0 : rel = 1 711 oldtype[++i] = scantype 712 scantype = "prod" 713 } 714 715 # person, possibly within review. 716 else if (name == "person" && \ 717 (scantype == "" || scantype == "review")) { 718 scantype == "" ? rel = 0 : rel = 1 719 oldtype[++i] = scantype 720 scantype = "person" 721 } 722 723 # address (i.e. place) , possibly within either "person" or 724 # "organization". 
725 else if (name ~ /^addr(ess)?$/ && \ 726 (scantype == "" || scantype == "person" || scantype == "org")) { 727 scantype == "" ? rel = 0 : rel = 1 728 oldtype[++i] = scantype 729 scantype = "addr" 730 } 731 732 # geographical coordinates, possibly within either "organization" 733 # or "person". 734 else if (name == "geo" && \ 735 (scantype == "" || scantype == "org" || scantype == "person")) { 736 scantype == "" ? rel = 0 : rel = 1 737 oldtype[++i] = scantype 738 scantype = "geo" 739 } 740 741 else if (flag) { 742 if (subrel) S = S _mrwresponse(m1,"",".","","","","","",a,b,1,"!") 743 else S = S _mrwresponse(m,"",".","","","","","",a,b,1,"!") 744 flag = subrel = 0 745 } 746 747 if (scantype == "org") { 748 749 # This must be tested on each loop! 750 if (oldtype[i] == "") rel = 0 751 752 flag = 0 753 754 if (name ~ /^org(anization)?$/) { 755 if (!rel) { 756 a[1] = "xmlns:v" 757 b[1] = "http://rdf.data-vocabulary.org/#" 758 a[2] = "typeof" 759 b[2] = "v:Organization" 760 S = S _mrwresponse(m,div,"","","","","","",a,b,1,"!") 761 } 762 else { 763 a[1] = "rel" 764 b[1] = "v:reviewer" # organization within review 765 S = S _mrwresponse(m,"","span","","","","","",a,b,1,"!") 766 767 # The typeof attribute isn't optional here, as the 768 # reviewer can be either a Person or an Organization, 769 # so it must be stated explicitly to avoid ambiguity. 770 771 a[1] = "typeof" 772 b[1] = "v:Organization" 773 S = S _mrwresponse(m,"","","","","span","","",a,b,1,"!") 774 } 775 776 # Handle optional abbreviation for "v:name". 777 # The caret "^" resembles an "A" and stands for "Abbreviation". 778 if (split(value,c,"^") > 1) { 779 if (c[2] ~ /^[-+]?[0-9]+\.[0-9]+ *; *[-+]?[0-9]+\.[0-9]+/) 780 value = "" c[1] "" 782 } 783 # Add more optional abbreviations below. 
784 785 a[1] = "property" 786 b[1] = "v:name" 787 S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1) 788 } 789 else if (name ~ /^tel(ephone)?$/) { 790 a[1] = "property" 791 b[1] = "v:tel" 792 S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1) 793 } 794 else if (name == "url") { 795 a[1] = "rel" 796 b[1] = "v:url" 797 S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1) 798 } 799 else { 800 scantype = oldtype[i--] 801 s = saved s # no match, stack it back. 802 flag = 1 803 } 804 } 805 806 else if (scantype == "person") { 807 808 # This must be checked on each loop! 809 if (oldtype[i] == "") rel = 0 810 811 if (name == "person") { 812 if (!rel) { 813 a[1] = "xmlns:v" 814 b[1] = "http://rdf.data-vocabulary.org/#" 815 a[2] = "typeof" 816 b[2] = "v:Person" 817 S = S _mrwresponse(m,div,"","","","","","",a,b,1,"!") 818 } 819 else { 820 a[1] = "rel" 821 b[1] = "v:reviewer" # person within review 822 S = S _mrwresponse(m,"","span","","","","","",a,b,1,"!") 823 824 # The typeof attribute isn't optional here, as the 825 # reviewer can be either a Person or an Organization, 826 # so it must be stated explicitly to avoid ambiguity. 827 828 a[1] = "typeof" 829 b[1] = "v:Person" 830 S = S _mrwresponse(m,"","","","","span","","",a,b,1,"!") 831 } 832 833 a[1] = "property" 834 b[1] = "v:name" 835 S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1) 836 } 837 else if (name ~ /^nick(name)?$/) { 838 a[1] = "property" 839 b[1] = "v:nickname" 840 S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1) 841 } 842 else if (name ~ /^(org|affiliation)$/){ 843 844 # Try and detect optional URI in entry. This is currently of 845 # (possibly future) use only with "review" types and person's 846 # affiliation, as per the relevant Google docs. 
847 848 uri = value 849 if (sub(/ .*/,"",uri) && _isuri(uri) == _TRUE) 850 sub(/^[^ ]+ +/,"",value) 851 else uri = "" 852 853 if (uri == "") { 854 a[1] = "property" 855 b[1] = "v:affiliation" 856 S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1) 857 } 858 else { 859 a[1] = "rel" 860 b[1] = "v:affiliation" 861 S = S _mrwresponse(m,"","span","","","","","",a,b,1,"!") 862 863 a[1] = "about" 864 b[1] = uri 865 a[2] = "property" 866 b[2] = "v:name" 867 868 # Detect also bnode CURIEs, as explained here: 869 # http://www.w3.org/MarkUp/2009/rdfa-for-html-authors 870 if (uri ~ /^\[_:/) { 871 a[4] = "rel" 872 b[4] = "foaf:isPrimaryTopicOf" 873 } 874 S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1) 875 } 876 } 877 else if (name ~ /^(url|photo)$/) { 878 a[1] = "rel" 879 b[1] = "v:" name 880 S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1) 881 } 882 else if (name ~ /^(title|role)$/) { 883 a[1] = "property" 884 b[1] = "v:" name 885 S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1) 886 } 887 888 # Google also recognizes XFN's "friend", "contact" and 889 # "acquaintance" properties, which are used to identify 890 # social relationships in a grassroots fashion typical 891 # of the so-called "microformats", with XFN being a 892 # lightweight alternative to FOAF. For more info see 893 # http://gmpg.org/xfn/intro . This code tries and support 894 # a larger XFN subset than the one supported by Google. 895 # I make no validity checks on the type of XFN rel(s) 896 # specified by the user, so I can potentially support 897 # any of them, even future ones. 898 899 else if (name == "xfn") { 900 name = value = _strip(value,_O_CRUSH) 901 sub(/ .*/,"",name) 902 sub(/[^ ]+ /,"",value) 903 sub(/,/," ",name) 904 905 # Prepend the "v:" namespace prefix only to those 906 # (single) values supported by Google. 
907 #if (name ~ /^(friend|contact|acquaintance)$/) name = "v:" name 908 909 if (value ~ /^<[aA] /) { 910 # append to existing rel='' if any, or set one anew. 911 re = "[^a-z]*rel=[\"']" 912 if (value !~ re "[^'\"]*" name && \ 913 !sub(re, "&" name " ",value)) 914 sub(/ /," rel='" name "' ",value) 915 S = S value 916 } 917 else if (split(value,b," ") == 2) { 918 b[3] = b[2] 919 a[1] = "href" 920 a[2] = "rel" 921 b[2] = name 922 S = S _mrwresponse(m,"","","","","","a",b[3],a,b,1,"!") 923 } 924 else { 925 scantype = oldtype[i--] 926 s = saved s # invalid, stack it back. 927 flag = 1 928 } 929 } 930 else { 931 scantype = oldtype[i--] 932 s = saved s # no match, stack it back. 933 flag = 1 934 } 935 } 936 937 else if (scantype == "addr") { 938 939 # This must be checked on each loop! 940 if (oldtype[i] == "") rel = 0 941 942 flag = 0 943 944 if (name ~ /^addr(ess)?$/) { 945 946 if (oldtype[i-1] ~ /^(org|person|review)$/) subrel = 1 947 else subrel = 0 948 949 if (!rel) { 950 a[1] = "xmlns:v" 951 b[1] = "http://rdf.data-vocabulary.org/#" 952 a[2] = "typeof" 953 b[2] = "v:PostalAddress" 954 S = S _mrwresponse(m,div,"","","","","","",a,b,1,"!") 955 } 956 else { 957 a[1] = "rel" 958 b[1] = "v:address" 959 if (subrel) 960 S = S _mrwresponse(m1,"","span","","","","","",a,b,1,"!") 961 else 962 S = S _mrwresponse(m,"","span","","","","","",a,b,1,"!") 963 964 # A "v:address" rel entry can only be an address type, 965 # so no explicity typeof property is necessary. I therefore 966 # omit it for brevity, see Google's RDFa specs. 967 #a[1] = "typeof" 968 #b[1] = "v:PostalAddress" 969 #if (subrel) ... 970 #else ... 
971 } 972 a[1] = "property" 973 b[1] = "v:street-address" 974 if (subrel) 975 S = S _mrwresponse(m1,"","","","","","span",value,a,b,1,"!",1) 976 else 977 S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1) 978 } 979 else if (name ~ /^loc(ality)?$/) { 980 a[1] = "property" 981 b[1] = "v:locality" 982 if (subrel) 983 S = S _mrwresponse(m1,"","","","","","span",value,a,b,1,"!",1) 984 else 985 S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1) 986 } 987 else if (name == "region") { 988 a[1] = "property" 989 b[1] = "v:region" 990 if (subrel) 991 S = S _mrwresponse(m1,"","","","","","span",value,a,b,1,"!",1) 992 else 993 S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1) 994 } 995 else if (name == "country") { 996 a[1] = "property" 997 b[1] = "v:country-name" 998 if (subrel) 999 S = S _mrwresponse(m1,"","","","","","span",value,a,b,1,"!",1) 1000 else 1001 S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1) 1002 } 1003 else if (name == "zip") { 1004 a[1] = "property" 1005 b[1] = "v:postal-code" 1006 if (subrel) 1007 S = S _mrwresponse(m1,"","","","","","span",value,a,b,1,"!",1) 1008 else 1009 S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1) 1010 } 1011 else { 1012 scantype = oldtype[i--] 1013 s = saved s # no match, stack it back. 1014 flag = 1 1015 } 1016 } 1017 1018 else if (scantype == "prod") { 1019 1020 # This must be checked on each loop! 
1021 if (oldtype[i] == "") rel = 0 1022 1023 if (name ~ /^prod(uct)?$/) { 1024 if (!rel) { 1025 a[1] = "xmlns:v" 1026 b[1] = "http://rdf.data-vocabulary.org/#" 1027 a[2] = "typeof" 1028 b[2] = "v:Product" 1029 S = S _mrwresponse(m,div,"","","","","","",a,b,1,"!") 1030 } 1031 else { 1032 a[1] = "rel" 1033 b[1] = "v:itemReviewed" # product within review 1034 S = S _mrwresponse(m,"","span","","","","","",a,b,1,"!") 1035 1036 a[1] = "typeof" 1037 b[1] = "v:Product" 1038 S = S _mrwresponse(m,"","","","","span","","",a,b,1,"!") 1039 } 1040 a[1] = "property" 1041 b[1] = "v:name" 1042 S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1) 1043 } 1044 else if (name == "brand") { 1045 a[1] = "property" 1046 b[1] = "v:brand" 1047 S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1) 1048 } 1049 else if (name ~ /^cat(egory)?$/) { 1050 a[1] = "property" 1051 b[1] = "v:category" 1052 S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1) 1053 } 1054 else if (name ~ /^desc(ription)?$/) { 1055 a[1] = "property" 1056 b[1] = "v:description" 1057 S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1) 1058 } 1059 else if (name == "price") { 1060 a[1] = "property" 1061 b[1] = "v:price" 1062 value = sprintf("%.2f",value/1) # RDFa specs. 1063 S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1) 1064 } 1065 else if (name == "photo") { 1066 a[1] = "rel" 1067 b[1] = "v:photo" 1068 S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1) 1069 } 1070 else if (name == "url") { 1071 a[1] = "rel" 1072 b[1] = "v:url" 1073 S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1) 1074 } 1075 else { 1076 scantype = oldtype[i--] 1077 s = saved s # no match, stack it back. 
1078 flag = 1 1079 } 1080 } 1081 1082 else if (scantype == "review") { 1083 1084 if (name == "review") { 1085 a[1] = "xmlns:v" 1086 b[1] = "http://rdf.data-vocabulary.org/#" 1087 a[2] = "typeof" 1088 b[2] = "v:Review" 1089 S = S _mrwresponse(m,div,"","","","","","",a,b,1,"!") 1090 1091 # Handle optional abbreviation for "v:name". 1092 # The caret "^" resembles an "A" and stands for "Abbreviation". 1093 if (split(value,c,"^") > 1) { 1094 if (c[2] ~ /^[-+]?[0-9]+\.[0-9]+ *; *[-+]?[0-9]+\.[0-9]+/) 1095 value = "" c[1] "" 1097 } 1098 # Add more optional abbreviations below. 1099 1100 if (uri == "") { 1101 a[1] = "property" 1102 b[1] = "v:itemReviewed" 1103 S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1) 1104 } 1105 else { 1106 a[1] = "rel" 1107 b[1] = "v:itemReviewed" 1108 S = S _mrwresponse(m,"","span","","","","","",a,b,1,"!") 1109 1110 a[1] = "about" 1111 b[1] = uri 1112 a[2] = "property" 1113 b[2] = "v:name" 1114 1115 # Detect also bnode CURIEs, as explained here: 1116 # http://www.w3.org/MarkUp/2009/rdfa-for-html-authors 1117 if (uri ~ /^\[_:/) { 1118 a[4] = "rel" 1119 b[4] = "foaf:isPrimaryTopicOf" 1120 } 1121 1122 S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1) 1123 } 1124 } 1125 else if (name == "by" || name == "reviewer") { 1126 a[1] = "property" 1127 b[1] = "v:reviewer" 1128 S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1) 1129 } 1130 else if (name == "rating") { 1131 a[1] = "property" 1132 b[1] = "v:rating" 1133 S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1) 1134 } 1135 else if (name == "date") { 1136 a[1] = "property" 1137 b[1] = "v:dtReviewed" 1138 S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1) 1139 } 1140 else if (name == "summary") { 1141 a[1] = "property" 1142 b[1] = "v:summary" 1143 S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1) 1144 } 1145 else if (name ~ /^desc(ription)?$/) { 1146 a[1] = "property" 1147 b[1] = "v:description" 1148 S = S 
_mrwresponse(m,"","","","","","span",value,a,b,1,"!",1) 1149 } 1150 else { 1151 scantype = oldtype[i--] 1152 s = saved s # no match, stack it back. 1153 flag = 1 1154 } 1155 } 1156 1157 else if (scantype == "areview") { 1158 1159 if (name == "areview") { 1160 a[1] = "xmlns:v" 1161 b[1] = "http://rdf.data-vocabulary.org/#" 1162 a[2] = "typeof" 1163 b[2] = "v:Review-aggregate" 1164 S = S _mrwresponse(m,div,"","","","","","",a,b,1,"!") 1165 1166 if (uri == "") { 1167 a[1] = "property" 1168 b[1] = "v:itemReviewed" 1169 S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1) 1170 } 1171 else { 1172 a[1] = "rel" 1173 b[1] = "v:itemReviewed" 1174 S = S _mrwresponse(m,"","span","","","","","",a,b,1,"!") 1175 1176 a[1] = "about" 1177 b[1] = uri 1178 a[2] = "property" 1179 b[2] = "v:name" 1180 1181 # Detect also bnode CURIEs, as explained here: 1182 # http://www.w3.org/MarkUp/2009/rdfa-for-html-authors 1183 if (uri ~ /^\[_:/) { 1184 a[4] = "rel" 1185 b[4] = "foaf:isPrimaryTopicOf" 1186 } 1187 1188 # Handle optional abbreviation for "v:name". 1189 # The caret "^" resembles an "A" and stands for "Abbreviation". 1190 if (split(value,c,"^") > 1) { 1191 if (c[2] ~ /^[-+]?[0-9]+\.[0-9]+ *; *[-+]?[0-9]+\.[0-9]+/) 1192 value = "" c[1] "" 1194 } 1195 # Add more optional abbreviations below. 
1196 1197 S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1) 1198 } 1199 } 1200 else if (name == "rating") { 1201 a[1] = "rel" 1202 b[1] = "v:rating" 1203 S = S _mrwresponse(m,"","span","","","","",value,a,b,1,"!") 1204 1205 a[1] = "property" 1206 b[1] = "v:average" 1207 S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!") 1208 S = S _mrwresponse(m,"",".","","","","","",a,b,1,"!") 1209 } 1210 else if (name == "count") { 1211 a[1] = "property" 1212 b[1] = "v:count" 1213 S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1) 1214 } 1215 else if (name == "summary") { 1216 a[1] = "property" 1217 b[1] = "v:summary" 1218 S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1) 1219 } 1220 else { 1221 scantype = oldtype[i--] 1222 s = saved s # no match, stack it back. 1223 flag = 1 1224 } 1225 } 1226 1227 else if (scantype == "geo") { 1228 1229 # This must be tested on each loop! 1230 if (oldtype[i] == "") rel = 0 1231 1232 flag = 0 1233 1234 if (name == "geo") { 1235 1236 if (oldtype[i-1] ~ /^(org|person|review)$/) subrel = 1 1237 else subrel = 0 1238 1239 if (!rel) { 1240 a[1] = "xmlns:v" 1241 b[1] = "http://rdf.data-vocabulary.org/#" 1242 1243 # Geo data is not supported by Google RDFa specs yet 1244 # so I need to point at a different namespace here. 1245 # See http://www.w3.org/2003/01/geo/#example . 
1246 1247 a[2] = "xmlns:geo" 1248 b[2] = "http://www.w3.org/2003/01/geo/wgs84_pos#" 1249 1250 a[3] = "typeof" 1251 b[3] = "geo:Point" 1252 S = S _mrwresponse(m,div,"","","","","","",a,b,1,"!") 1253 } 1254 else { 1255 a[1] = "xmlns:geo" 1256 b[1] = "http://www.w3.org/2003/01/geo/wgs84_pos#" 1257 a[2] = "rel" 1258 b[2] = "geo:Point" 1259 if (subrel) 1260 S = S _mrwresponse(m1,"","span","","","","","",a,b,1,"!") 1261 else 1262 S = S _mrwresponse(m,"","span","","","","","",a,b,1,"!") 1263 } 1264 1265 # Ignore any "name" and "schema" spects of the (:v-geo ...:) 1266 # CPI here, as they are not part of the RDFa specs and I 1267 # dont't know how to code them. It will always be possible 1268 # to support them in the future by inserting the appropriate 1269 # code here. 1270 } 1271 else if (name ~ /^lat(itude)?$/) { 1272 a[1] = "property" 1273 b[1] = "geo:lat" 1274 if (subrel) 1275 S = S _mrwresponse(m1,"","","","","","span",value,a,b,1,"!",1) 1276 else 1277 S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1) 1278 } 1279 else if (name ~ /^long(itude)?$/) { 1280 a[1] = "property" 1281 b[1] = "geo:long" 1282 if (subrel) 1283 S = S _mrwresponse(m1,"","","","","","span",value,a,b,1,"!",1) 1284 else 1285 S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1) 1286 } 1287 else { 1288 scantype = oldtype[i--] 1289 s = saved s # no match, stack it back. 1290 flag = 1 1291 } 1292 } 1293 1294 else S = S "(:v-" name " " value ":)" 1295 } 1296 1297 # Close the resulting top-level structure. Leave any subrel(s) alone 1298 # for the moment, but it may be necessary to try and uncomment next 1299 # line if any structure errors occur under certain circumstances. 1300 1301 #if (subrel) S = S _mrwresponse(m1,".","","","","","","",a,b,1,"!") 1302 1303 S = S _mrwresponse(m,".","","","","","","",a,b,1,"!") 1304 1305 # Better comment this out while debugging. 1306 S = S _RRMATCH 1307 1308 gsub(/\001/,":)",S) 1309 1310 return S 1311 } 1312