1 # ===================================================================== 2 # _mfmtcpi.awk: replace CPI-encoded Microformat markup in string. 3 # 4 # Copyright (c) 2009 Carlo Strozzi 5 # 6 # This program is free software; you can redistribute it and/or modify 7 # it under the terms of the GNU General Public License as published by 8 # the Free Software Foundation; version 2 dated June, 1991. 9 # 10 # This program is distributed in the hope that it will be useful, 11 # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 # GNU General Public License for more details. 14 # 15 # You should have received a copy of the GNU General Public License 16 # along with this program; if not, write to the Free Software 17 # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 18 # 19 # ===================================================================== 20 # string _mfmtcpi(string s) 21 # 22 # A number of considerations are in order regarding this function: 23 # 24 # Only line-level CPIs are used, i.e. no block-level (::x:)...(:x::) . 25 # 26 # Each supported type is handled explicitly. Top level elements 27 # ("organization", "person", etc.) will determine how lower level 28 # elements will be handled. That is, an "address" element at top 29 # level will be handled differently from an "address" element found 30 # at a lower level. 31 # 32 # Currently Google recognizes also the following structures, which this 33 # function tries to support: 34 # 35 # 1) "organization" data embedded in "review". 36 # 37 # 2) "person" data embedded in "review". 38 # 39 # 3) "product" data embedded in "review". 40 # 41 # 4) "address" data embedded in either "organization" or "person". 42 # 43 # Up to two levels of nesting are supported, such as "address" inside 44 # an "organization" inside a "review". 
More can be supported in the 45 # future simply by using a multi-valued integer for "subrel" and more 46 # than two work arrays for _mrwresponse() (currently "m", and "m1", 47 # see the code below). 48 # 49 # This local implementation supports also "geo" data, embedded into 50 # either "organization" or "person", because it is considered as an 51 # alternate form of "postal" address here. Note that although it may 52 # seem tempting to allow geo data also as a child element of "address", 53 # this is probably not to be done because, as I said, geographical 54 # coordinates should rather be considered an *alternate* form of 55 # address, not a property of an ordinary postal address as they can 56 # exist independently of the latter. Note also that geo-tagging is NOT 57 # supported on "review" entries, although they may be about places to 58 # visit etc. This limitation can be overcome by resorting to inline 59 # geo-tagging through <abbr> elements. See further down for more. 60 # 61 # 62 # Finally, this program is complicated, so do expect bugs. 63 # 64 # ===================================================================== 65 # 66 # Sample input: 67 # 68 # Sample misplaced entry; since it belongs into the "address" context, 69 # which is not open yet, it is left unparsed: 70 # 71 # (:v-region WA:) 72 # 73 # Open an "organization" context: 74 # 75 # (:v-organization ACME Example 1 Inc.^45.123;9.456:) 76 # (:v-telephone +1 (0)234 123456:) 77 # 78 # In the previous example note the presence of "^45.123;9.456" 79 # immediately following "ACME Example 1 Inc.". The caret "^" stands for 80 # "Abbreviation" and can be used to define an abbreviation for the 81 # associated name. The exact type of abbreviation depends on the pattern 82 # of characters that follow the caret itself. Currently only embedded 83 # geo-tagging is supported, but more abbreviation types may be added 84 # in the future. Only one single abbreviation is taken into account, 85 # i.e. 
if a pattern such as "Text^stuff1^stuff2" is specified, only 86 # "Text^stuff1" will be considered. This seems reasonable since having 87 # multiple abbreviations for the same piece of text seems useless. 88 # In general, abbreviations are currently supported only for fn-class 89 # hCard entries, and they are only supported to wrap #CDATA content 90 # with no embedded markup. That is, this function can NOT currently 91 # produce the following construct: 92 # 93 #
94 # 95 # Komala Vilas 96 # 97 #
98 # 99 # In fact, the function COULD be modified to produce said code, but 100 # since the input CPI directives (:...:) cannot be nested we have no 101 # way to appropriately describe the input, hence this limitation. 102 # However, it is much easier and apparently just as legal (and even 103 # preferable IMHO) to produce the following equivalent code: 104 # 105 #
106 # 107 # Komala Vilas 108 # 109 #
110 # 111 # and so that's the route that I have taken here. 112 # 113 # See http://www.xfront.com/microformats/hCard_part5.html for more. 114 # 115 # 116 # Sample extra entry that does not parse as RDFa 117 # and is left unparsed: 118 # 119 # (:vvvv aaa bbb:) 120 # 121 # Other stuff belonging into the current "organization" context: 122 # 123 # (:v-url http://www1.example.com:) 124 # 125 # Postal Address; since we have an "organization" context already open, 126 # this is treated as that organization's address: 127 # 128 # (:v-address Some Street 1, 123:) 129 # (:v-locality Some Town 1:) 130 # 131 # Entering the same element twice (see previous line) is probably incorrect 132 # but the parser currently accepts it: 133 # 134 # (:v-locality Some Other Town 2:) 135 # 136 # Sample mistyped data (trailing ":" after "region") that will 137 # be left unparsed: 138 # 139 # (:v-region: NJ:) 140 # 141 # Other components of the current "address" context follow: 142 # 143 # (:v-zip 54321:) 144 # (:v-country USA:) 145 # 146 # Organization after an address: 147 # 148 # (:v-organization ACME Example 2 Inc.:) 149 # (:v-telephone +2 (0)234 123456:) 150 # 151 # Sample "product" entry; this is meaningless inside the current "address" 152 # context, so it is sent one level above first, which is "organization", 153 # and since it is invalid there too, a new top-level "product" context 154 # is started: 155 # 156 # (:v-product Gizmo:) 157 # (:v-cat Gadgets:) 158 # (:v-desc A Nifty Useless Tingie:) 159 # (:v-price 123.4:) 160 # (:v-photo http://www.example.com/photos/p25.jpg:) 161 # (:v-url http://www2.example.com/products/p25:) 162 # 163 # Sample "review" entry; this is meaningless inside the previous "product" 164 # context, so a new top-level "review" context is started (reviews are 165 # toplevel-only elements): 166 # 167 # (:v-review Komala Vilas 1:) 168 # (:v-by Meenakshi Ammal:) 169 # (:v-rating 1.1:) 170 # (:v-date 1st April 2005:) 171 # (:v-summary Best vegetarian food 1:) 172 
# 173 # Sample "review" entry, with optional URI/URL/CURIE that will be used to 174 # build the RDF "about" property of the entry (this is currently 175 # supported by Google only for Reviews and Review-aggregates): 176 # 177 # (:v-review http://www.example.com Komala Vilas 2:) 178 # (:v-by Meenakshi Ammal:) 179 # (:v-rating 2.2:) 180 # (:v-date 2nd April 2005:) 181 # (:v-summary Best vegetarian food 2:) 182 # 183 # Sample "review" entry, with optional blank-node CURIE: 184 # 185 # (:v-review [_:KomalaResort] Komala Vilas 3:) 186 # (:v-by Meenakshi Ammal:) 187 # (:v-rating 3.3:) 188 # (:v-date 3rd April 2005:) 189 # (:v-summary Best vegetarian food 3:) 190 # 191 # Sample "review aggregate" (this one too supports optional URI/URL/CURIE): 192 # 193 # (:v-areview Komala Vilas 4:) 194 # (:v-rating 3.7:) 195 # (:v-count 20:) 196 # (:v-summary Best vegetarian food 4:) 197 # 198 # Sample "review" entry, with the reviewer being an organization instead 199 # of simple #CDATA with the name of the author; note also the geographical 200 # coordinates: 201 # 202 # (:v-review Komala Vilas 5:) 203 # (:v-organization ACME Example 3 Inc.:) 204 # (:v-telephone +3 (0)234 123456:) 205 # (:v-geo:) 206 # (:v-latitude 45.123:) 207 # (:v-longitude 9.456:) 208 # (:v-address Some Street 2, 123:) 209 # (:v-locality Some Town 3:) 210 # (:v-rating 3.8:) 211 # (:v-date 4th April 2005:) 212 # (:v-summary Best vegetarian food 5:) 213 # 214 # Sample "review", again by an organization, where "v-telephone" has 215 # been placed after the last element of the "address" context ("v-locality"): 216 # 217 # (:v-review Komala Vilas 6:) 218 # (:v-organization ACME Example 4 Inc.:) 219 # (:v-address Some Street 3, 123:) 220 # (:v-locality Some Town 4:) 221 # (:v-telephone +4 (0)234 123456:) 222 # (:v-rating 3.9:) 223 # (:v-date 5th April 2005:) 224 # (:v-summary Best vegetarian food 6:) 225 # 226 # When the parser meets "v-telephone" in the "address" context" it 227 # will bail-out from that context and 
move up one level, which is 228 # the "organization" context. Since "v-telephone" is valid in that 229 # context, things will parse normally. 230 # 231 # In the following example, "v-telephone" is misplaced after "v-rating": 232 # 233 # (:v-review Komala Vilas 7:) 234 # (:v-organization ACME Example 5 Inc.:) 235 # (:v-address Some Street 4, 123:) 236 # (:v-locality Some Town 5:) 237 # (:v-rating 4.0:) 238 # (:v-telephone +5 (0)234 123456:) 239 # (:v-date 6th April 2005:) 240 # (:v-summary Best vegetarian food 7:) 241 # 242 # When the parser meets "v-rating" in the "address" context it bails-out 243 # to the parent "organization" context, and since "v-rating" is not valid 244 # in that context either, it bails-out further to the "review" context, 245 # where "v-rating" is valid. Then, while in the "review" context, the parser 246 # meets "v-telephone", which is invalid in that context, and the bail-out 247 # process starts again, with the parser climbing backwards until a valid 248 # element/context combination is met. Up to then, all elements encountered 249 # along the way will be left unparsed. Since no elements in the example 250 # belong into any of the upper contexts, they will all be left unparsed 251 # up to the end of the "review" context. 252 # 253 # Entering two adjacent address blocks is probably incorrect, 254 # but the parser currently accepts them: 255 # 256 # (:v-review Komala Vilas 8:) 257 # (:v-organization ACME Example 6 Inc.:) 258 # (:v-address Some Street 5, 123:) 259 # (:v-locality Some Town 6:) 260 # (:v-address Some Other Street 6, 123:) 261 # (:v-locality Some Other Town 7:) 262 # 263 # In the next example a review done by an organization is followed by 264 # a "person" block. Since reviews can take also persons as reviewers, 265 # a "person" occurring while in a "review" context is considered a 266 # reviewer. 
In this example therefore we will end up with having 267 # two reviewers for the same review, that is an organization and a 268 # person: 269 # 270 # (:v-review Komala Vilas 9^45.123;9.456:) 271 # (:v-organization ACME Example 7 Inc.:) 272 # (:v-telephone +6 (0)234 123456:) 273 # (:v-address Some Street 7, 123:) 274 # (:v-locality Some Town 8:) 275 # (:v-rating 4.1:) 276 # (:v-date 7th April 2005:) 277 # (:v-summary Best vegetarian food 8:) 278 # 279 # (:v-person John Smith Jr. 1:) 280 # (:v-role General Manager 1:) 281 # 282 # Having multiple reviewers in the same "review" context may be wrong for 283 # Google's crawler, but the parser currently accepts it. If considering 284 # "John Smith Jr." a second reviewer of "Komala Vilas" is not what we 285 # mean, we must ensure to tell the parser by inserting a "v-break" element 286 # between the end of the review and the beginning of the person: 287 # 288 # (:v-review Komala Vilas 10:) 289 # (:v-organization ACME Example 8 Inc.:) 290 # (:v-telephone +7 (0)234 123456:) 291 # (:v-address Some Street 8, 123:) 292 # (:v-locality Some Town 9:) 293 # (:v-rating 4.2:) 294 # (:v-date 8th April 2005:) 295 # (:v-summary Best vegetarian food 9:) 296 # 297 # (:v-break:) 298 # 299 # (:v-person John Smith Jr. 2:) 300 # (:v-role General Manager 2:) 301 # 302 # 303 # Sample "person" entry, with an embedded address block: 304 # 305 # (:v-person John Smith Jr. 
3:) 306 # (:v-url http://www.example.org/~jsmith:) 307 # (:v-photo http://www.example.org/~jsmith/photo.jpg:) 308 # (:v-address Some Street 9, 892:) 309 # (:v-locality Some Town 10:) 310 # (:v-region NY:) 311 # (:v-zip 56671:) 312 # (:v-country USA:) 313 # 314 # Sample XFN properties of the current "person" context: 315 # (:v-xfn friend Brad 1:) 316 # 317 # Same as above, but with multi-valued 'rel' attribute, which is supported 318 # by the XFN specs but not by Google, so I omit the "v:" namespace prefix 319 # on output: 320 # (:v-xfn friend,met Brad 2:) 321 # 322 # Another XFN properties of the current "person", in a different format: 323 # (:v-xfn acquaintance http://brad-log3.example.org Brad3:) 324 # 325 # Same as above, but with multi-valued 'rel' attribute, which is supported 326 # by the XFN specs but not by Google, so I omit the "v:" namespace prefix 327 # on output: 328 # (:v-xfn co-worker,met http://brad-log4.example.org Brad4:) 329 # 330 # (:v-xfn me Carlo:) 331 # 332 # In the above examples all tags have been written on a separate line 333 # and without any interspersed text. That was done for readability, but 334 # the whole purpose of the RDFa markup is the semantic labelling of 335 # free-form text, so in practical situations a "person" definition 336 # will be surrounded by- and interspersed with text, like this: 337 # 338 # Lorem ipsum dolor sit amet (:v-person John Smith Jr. 4:), consectetur 339 # adipiscing elit (:v-role General Manager 3:). Aenean convallis 340 # scelerisque metus, ... 341 # 342 # Following is the parsed RDFa output corresponding to the above input 343 # with the interspersed running text removed for clarity: 344 # 345 #
346 # 347 # ACME Example 1 348 # Inc. 349 # 350 # +1 (0)234 123456 351 # 352 # http://www1.example.com 353 # 354 # 355 # Some Street 1, 123 356 # Some Town 1 357 # Some Other Town 2 358 # 54321 359 # USA 360 # 361 #
362 #
363 # ACME Example 2 Inc. 364 # +2 (0)234 123456 365 #
366 #
367 # Gizmo 368 # Gadgets 369 # A Nifty Useless Tingie 370 # 123.40 371 # 372 # 373 # http://www.example.com/photos/p25.jpg 374 # 375 # 376 # 377 # http://www2.example.com/products/p25 378 # 379 #
380 #
381 # 382 # Komala Vilas 1 383 # Meenakshi Ammal 384 # 1.1 385 # 1st April 2005 386 # Best vegetarian food 1 387 # 388 #
389 #
390 # 391 # Komala Vilas 2 392 # Meenakshi Ammal 393 # 2.2 394 # 2nd April 2005 395 # Best vegetarian food 2 396 # 397 #
398 #
399 # 400 # Komala Vilas 3 401 # Meenakshi Ammal 402 # 3.3 403 # 3rd April 2005 404 # Best vegetarian food 3 405 # 406 #
407 #
408 # 409 # Komala Vilas 4 410 # 411 # 412 # 3.7 413 # 414 # 20 415 # Best vegetarian food 4 416 #
417 #
418 # 419 # Komala Vilas 5 420 # 421 # 422 # ACME Example 3 Inc. 423 # +3 (0)234 123456 424 # 425 # Earth 426 # WGS84 427 # 45.123 428 # 9.456 429 # 430 # 431 # Some Street 2, 123 432 # Some Town 3 433 # 434 # 435 # 3.8 436 # 4th April 2005 437 # Best vegetarian food 5 438 #
439 #
440 # 441 # Komala Vilas 6 442 # 443 # 444 # ACME Example 4 Inc. 445 # 446 # Some Street 3, 123 447 # Some Town 4 448 # 449 # +4 (0)234 123456 450 # 451 # 3.9 452 # 5th April 2005 453 # Best vegetarian food 6 454 #
455 #
456 # 457 # Komala Vilas 7 458 # 459 # 460 # ACME Example 5 Inc. 461 # 462 # Some Street 4, 123 463 # Some Town 5 464 # 465 # 466 # 4.0(:v-telephone +5 (0)234 467 # 123456:)(:v-date 6th April 2005:)(:v-summary Best vegetarian food 468 # 7:)
469 #
470 # 471 # Komala Vilas 8 472 # 473 # 474 # ACME Example 6 Inc. 475 # 476 # Some Street 5, 123 477 # Some Town 6 478 # 479 # 480 # Some Other Street 6, 123 481 # Some Other Town 7 482 # 483 # 484 #
485 #
486 # 487 # 488 # Komala Vilas 9 489 # 490 # 491 # 492 # ACME Example 7 Inc. 493 # +6 (0)234 123456 494 # 495 # Some Street 7, 123 496 # Some Town 8 497 # 498 # 499 # 4.1 500 # 7th April 2005 501 # Best vegetarian food 8 502 # 503 # John Smith Jr. 1 504 # General Manager 1 505 # 506 #
507 #
508 # 509 # Komala Vilas 10 510 # 511 # 512 # ACME Example 8 Inc. 513 # +7 (0)234 123456 514 # 515 # Some Street 8, 123 516 # Some Town 9 517 # 518 # 519 # 4.2 520 # 8th April 2005 521 # Best vegetarian food 9 522 #
523 #
524 # John Smith Jr. 2 525 # General Manager 2 526 #
527 #
528 # John Smith Jr. 3 529 # 530 # 531 # http://www.example.org/~jsmith 532 # 533 # 534 # 535 # http://www.example.org/~jsmith/photo.jpg 536 # 537 # 538 # Some Street 9, 892 539 # Some Town 10 540 # NY 541 # 56671 542 # USA 543 # 544 # Brad 1 545 # Brad 546 # 2 547 # 548 # Brad3 549 # 550 # Brad4 551 # Carlo 552 #
553 #
554 # John Smith Jr. 4 555 # General Manager 3 556 #
#
# =====================================================================
#
# See also the following additional sources:
#
#  * The source code of the _mrwresponse() CSA function.
#  * http://en.wikipedia.org/wiki/Microformat
#  * http://en.wikipedia.org/wiki/Geo_(microformat)
#  * http://microformats.org
#  * http://microformats.org/wiki/geo-extension-strawman
#  * http://www.google.com/support/webmasters/bin/answer.py?answer=146898
#  * http://gmpg.org/xfn/intro
#  * http://www.gmpg.org/xfn/join
#  * http://rubhub.com/main/add
#  * http://gmpg.org/xfn/background (this is *key* to understanding XFN)
#
# This link shows how to easily get long/lat from a Google Map search:
#   http://lifehacker.com/267361/how-to-find-latitude-and-longitude
#
# And here is a Microformat validation service:
#   http://microformatique.com/optimus/
#
# Use of <abbr> tags with hCard:
#   http://www.xfront.com/microformats/hCard_part5.html
#
# =====================================================================

# _mfmtcpi(s): scan string "s" for line-level "(:v-NAME VALUE:)" CPI
# directives and return a copy of it in which every directive that is
# valid in the current parsing context has been replaced by the markup
# generated through _mrwresponse(). Directives with an unknown NAME, or
# that are not valid in any open context, are copied to the output as
# text.
#
# Only "s" is meant to be passed by the caller; every other name in the
# parameter list is a local work variable.
#
# Local state:
#   scantype  name of the context being parsed ("" means top level).
#   oldtype[] stack of enclosing contexts, "i" is its top index.
#   rel       non-zero when the current context is nested in another.
#   subrel    non-zero when output goes to the second work array "m1"
#             (address/geo nested inside org, person or review).
#   flag      set when a directive did not fit the current context and
#             has been pushed back onto the input for the parent context.
#   m, m1     work arrays handed to _mrwresponse().
#
# Globals used: _RLMATCH, _RMATCH (also modified here), _RRMATCH, _NULL,
# _TRUE, _O_CRUSH.

function _mfmtcpi(s,    a,b,c,scantype,rel,name,value,uri,tag,S,\
                  oldtype,i,j,saved,flag,m,m1,subrel,div,re) {

    # Get local settings and apply defaults.
    if ((div=_rcget("TNS_GROUP_MISC_PROP",3)) != "span") div = "div"

    delete a; delete b; delete c; delete m; delete m1

    # Populate array of valid tags. Note the presence of the geo-tagging
    # elements, which are not (yet) supported by Google but I like to
    # have them on board right from the start. Albeit the specs tell
    # that the Geo microformat is a part of the hCard specification,
    # so it should appear within an hCard, I accept it also within
    # other elements, as well as standalone. See also this link:
    # http://en.wikipedia.org/wiki/HCard .

    tag["addr"] = 1
    tag["address"] = 1
    tag["affiliation"] = 1
    tag["areview"] = 1
    tag["br"] = 1
    tag["brand"] = 1
    tag["break"] = 1
    tag["by"] = 1
    tag["cat"] = 1
    tag["category"] = 1
    tag["count"] = 1
    tag["country"] = 1
    tag["date"] = 1
    tag["desc"] = 1
    tag["description"] = 1
    tag["geo"] = 1
    tag["lat"] = 1
    tag["latitude"] = 1
    tag["loc"] = 1
    tag["locality"] = 1
    tag["long"] = 1
    tag["longitude"] = 1
    tag["nick"] = 1
    tag["nickname"] = 1
    tag["org"] = 1
    tag["organization"] = 1
    tag["person"] = 1
    tag["photo"] = 1
    tag["price"] = 1
    tag["prod"] = 1
    tag["product"] = 1
    tag["rating"] = 1
    tag["region"] = 1
    tag["review"] = 1
    tag["reviewer"] = 1
    tag["role"] = 1
    tag["summary"] = 1
    tag["tel"] = 1
    tag["telephone"] = 1
    tag["title"] = 1
    tag["url"] = 1
    tag["xfn"] = 1
    tag["zip"] = 1

    # Turn every ":)" terminator into a single \001 byte, so that the
    # extraction pattern below can use a simple negated character class.
    # The reverse substitution is applied to the result before returning.
    gsub(/:\)/,"\001",s)

    # NOTE(review): "s" is a scalar and so is passed by value; the loop
    # relies on _extract() (not visible here) to keep track of, or consume,
    # the already-scanned input -- confirm against its source.
    while (_extract(s,"\\(:v-[^\\001]+\\001")) {

        # Better comment this out while debugging.
        S = S _RLMATCH

        # Keep the raw directive, in case it must be stacked back.
        saved = _RMATCH

        sub(/^\(:v-/,"",_RMATCH)
        sub(/\001/,"",_RMATCH)
        name = value = _RMATCH
        name = tolower(name)
        sub(/ .*/,"",name)

        # Unknown tag: copy it through. The membership test does not
        # create an empty tag[] entry as a side effect.
        if (!(name in tag)) {
            # Better comment this out while debugging.
            S = S "(:v-" _RMATCH "\001"
            continue
        }

        if (!sub(/^[^ ]+ +/,"",value)) value = _NULL

        # Handle each supported type explicitly.

        # review and review-aggregate.
        if (name ~ /^(a?review|br(eak)?)$/) {

            # Try and detect optional URI in entity name. This is currently
            # of (possibly future) use only with "review" types and person's
            # affiliation, as per the relevant Google docs.

            uri = value
            if (sub(/ .*/,"",uri) && _isuri(uri) == _TRUE)
                sub(/^[^ ]+ +/,"",value)
            else uri = ""

            # Like any toplevel-only entries, [a]reviews always bring us
            # back to top level. If by mistake they occur at a lower level
            # the remaining entries of such lower level(s) will either be
            # printed verbatim or they will become part of other aggregates
            # if compatible. While reviews accept nested "Person" or
            # "Organization" as the reviewer, review-aggregates do not,
            # as it would not make sense because each review is (or should
            # be) from a different source.

            if (subrel) S = S _mrwresponse(m1,".","","","","","","",a,b,1,"!")

            S = S _mrwresponse(m,".","","","","","","",a,b,1,"!")

            scantype = name
            delete oldtype
            i = j = rel = 0

            # Handle explicit break requests.
            if (name ~ /^br/) {
                scantype = ""
                continue
            }

            oldtype[++i] = scantype
        }

        # organization, possibly within review.
        else if (name ~ /^org(anization)?$/ && \
                 (scantype == "" || scantype == "review")) {
            rel = (scantype == "") ? 0 : 1
            oldtype[++i] = scantype
            scantype = "org"
        }

        # product, possibly within review.
        else if (name ~ /^prod(uct)?$/ && \
                 (scantype == "" || scantype == "review")) {
            rel = (scantype == "") ? 0 : 1
            oldtype[++i] = scantype
            scantype = "prod"
        }

        # person, possibly within review.
        else if (name == "person" && \
                 (scantype == "" || scantype == "review")) {
            rel = (scantype == "") ? 0 : 1
            oldtype[++i] = scantype
            scantype = "person"
        }

        # address (i.e. place) , possibly within either "person" or
        # "organization".
        else if (name ~ /^addr(ess)?$/ && \
                 (scantype == "" || scantype == "person" || scantype == "org")) {
            rel = (scantype == "") ? 0 : 1
            oldtype[++i] = scantype
            scantype = "addr"
        }

        # geographical coordinates, possibly within either "organization"
        # or "person".
        else if (name == "geo" && \
                 (scantype == "" || scantype == "org" || scantype == "person")) {
            rel = (scantype == "") ? 0 : 1
            oldtype[++i] = scantype
            scantype = "geo"
        }

        # The previous directive was stacked back: close the element of
        # the context that has just been left.
        else if (flag) {
            if (subrel) S = S _mrwresponse(m1,"",".","","","","","",a,b,1,"!")
            else S = S _mrwresponse(m,"",".","","","","","",a,b,1,"!")
            flag = subrel = 0
        }

        if (scantype == "org") {

            # This must be tested on each loop!
            if (oldtype[i] == "") rel = 0

            flag = 0

            if (name ~ /^org(anization)?$/) {
                if (!rel) {
                    a[1] = "class"
                    b[1] = "vcard"
                    S = S _mrwresponse(m,div,"","","","","","",a,b,1,"!")
                }
                else {
                    a[1] = "class"
                    b[1] = "reviewer"        # organization within review
                    S = S _mrwresponse(m,"","span","","","","","",a,b,1,"!")
                }

                # Handle optional abbreviation for "fn" class.
                # The caret "^" resembles an "A" and stands for "Abbreviation".
                # A "lat;long" abbreviation wraps the name into a geo <abbr>,
                # as described in the file header.
                if (split(value,c,"^") > 1) {
                    if (c[2] ~ /^[-+]?[0-9]+\.[0-9]+ *; *[-+]?[0-9]+\.[0-9]+/)
                        value = "<abbr class='geo' title='" c[2] "'>" \
                                c[1] "</abbr>"
                }
                # Add more optional abbreviations below.

                a[1] = "class"
                b[1] = "fn org"
                S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1)
            }
            else if (name ~ /^tel(ephone)?$/) {
                a[1] = "class"
                b[1] = "tel"
                S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1)
            }
            else if (name == "url") {
                a[1] = "class"
                b[1] = "url"
                S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1)
            }
            else {
                scantype = oldtype[i--]
                s = saved s        # no match, stack it back.
                flag = 1
            }
        }

        else if (scantype == "person") {

            # This must be tested on each loop!
            if (oldtype[i] == "") rel = 0

            if (name == "person") {
                if (!rel) {
                    a[1] = "class"
                    b[1] = "vcard"
                    S = S _mrwresponse(m,div,"","","","","","",a,b,1,"!")
                }
                else {
                    a[1] = "class"
                    b[1] = "reviewer"        # person within review
                    S = S _mrwresponse(m,"","span","","","","","",a,b,1,"!")
                }
                a[1] = "class"
                b[1] = "fn"
                S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1)
            }
            else if (name ~ /^nick(name)?$/) {
                a[1] = "class"
                b[1] = "nickname"
                S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1)
            }
            else if (name ~ /^(org|affiliation)$/) {

                # Discard optional URI in entry, which is not supported
                # with the Microformat output format.

                uri = value
                if (sub(/ .*/,"",uri) && _isuri(uri) == _TRUE)
                    sub(/^[^ ]+ +/,"",value)
                a[1] = "class"
                b[1] = "org"
                S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1)
            }
            else if (name ~ /^(url|photo)$/) {
                a[1] = "class"
                b[1] = name
                S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1)
            }
            else if (name ~ /^(title|role)$/) {
                a[1] = "class"
                b[1] = name
                S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1)
            }

            # Google also recognizes XFN's "friend", "contact" and
            # "acquaintance" properties, which are used to identify
            # social relationships in a grassroots fashion typical
            # of the so-called "microformats", with XFN being a
            # lightweight alternative to FOAF. For more info see
            # http://gmpg.org/xfn/intro . This code tries and support
            # a larger XFN subset than the one supported by Google.
            # I make no validity checks on the type of XFN rel(s)
            # specified by the user, so I can potentially support
            # any of them, even future ones.

            else if (name == "xfn") {
                name = value = _strip(value,_O_CRUSH)
                sub(/ .*/,"",name)
                sub(/[^ ]+ /,"",value)

                # "rel" takes a space-separated list: convert every
                # comma, so that lists of any length are handled.
                gsub(/,/," ",name)

                # Only "friend", "contact" and "acquaintance" are
                # currently supported by Google, but I support many
                # more here, possibly the full XFN set, including
                # multi-values.

                if (value ~ /^<[aA] /) {
                    # append to existing rel='' if any, or set one anew.
                    re = "[^a-z]*rel=[\"']"
                    if (value !~ re "[^'\"]*" name && \
                        !sub(re, "&" name " ",value))
                        sub(/ /," rel='" name "' ",value)
                    S = S value
                }
                else if (split(value,b," ") == 2) {
                    # b[1] is the link target, b[2] the link text.
                    b[3] = b[2]
                    a[1] = "href"
                    a[2] = "rel"
                    b[2] = name
                    S = S _mrwresponse(m,"","","","","","a",b[3],a,b,1,"!")
                }
                else {
                    scantype = oldtype[i--]
                    s = saved s        # invalid, stack it back.
                    flag = 1
                }
            }
            else {
                scantype = oldtype[i--]
                s = saved s        # no match, stack it back.
                flag = 1
            }
        }

        else if (scantype == "addr") {

            # This must be tested on each loop!
            if (oldtype[i] == "") rel = 0

            flag = 0

            if (name ~ /^addr(ess)?$/) {

                if (oldtype[i] ~ /^(org|person|review)$/) subrel = 1
                else subrel = 0

                if (!rel) {
                    a[1] = "class"
                    b[1] = "adr"
                    S = S _mrwresponse(m,div,"","","","","","",a,b,1,"!")
                }
                else {
                    a[1] = "class"
                    b[1] = "adr"
                    if (subrel)
                        S = S _mrwresponse(m1,"","span","","","","","",a,b,1,"!")
                    else
                        S = S _mrwresponse(m,"","span","","","","","",a,b,1,"!")
                }
                a[1] = "class"
                b[1] = "street-address"
                if (subrel)
                    S = S _mrwresponse(m1,"","","","","","span",value,a,b,1,"!",1)
                else
                    S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1)
            }
            else if (name ~ /^loc(ality)?$/) {
                a[1] = "class"
                b[1] = "locality"
                if (subrel)
                    S = S _mrwresponse(m1,"","","","","","span",value,a,b,1,"!",1)
                else
                    S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1)
            }
            else if (name == "region") {
                a[1] = "class"
                b[1] = "region"
                if (subrel)
                    S = S _mrwresponse(m1,"","","","","","span",value,a,b,1,"!",1)
                else
                    S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1)
            }
            else if (name == "country") {
                a[1] = "class"
                b[1] = "country-name"
                if (subrel)
                    S = S _mrwresponse(m1,"","","","","","span",value,a,b,1,"!",1)
                else
                    S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1)
            }
            else if (name == "zip") {
                a[1] = "class"
                b[1] = "postal-code"
                if (subrel)
                    S = S _mrwresponse(m1,"","","","","","span",value,a,b,1,"!",1)
                else
                    S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1)
            }
            else {
                scantype = oldtype[i--]
                s = saved s        # no match, stack it back.
                flag = 1
            }
        }

        else if (scantype == "prod") {

            # This must be tested on each loop!
            if (oldtype[i] == "") rel = 0

            if (name ~ /^prod(uct)?$/) {
                if (!rel) {
                    a[1] = "class"
                    b[1] = "hproduct"
                    S = S _mrwresponse(m,div,"","","","","","",a,b,1,"!")
                }
                else {
                    a[1] = "class"
                    b[1] = "hproduct"        # product within review
                    S = S _mrwresponse(m,"","span","","","","","",a,b,1,"!")
                }
                a[1] = "class"
                b[1] = "fn"
                S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1)
            }
            else if (name == "brand") {
                a[1] = "class"
                b[1] = "brand"
                S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1)
            }
            else if (name ~ /^cat(egory)?$/) {
                a[1] = "class"
                b[1] = "category"
                S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1)
            }
            else if (name ~ /^desc(ription)?$/) {
                a[1] = "class"
                b[1] = "description"
                S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1)
            }
            else if (name == "price") {
                a[1] = "class"
                b[1] = "price"
                value = sprintf("%.2f",value/1)        # RDFa specs.
                S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1)
            }
            else if (name == "photo") {
                a[1] = "class"
                b[1] = "photo"
                S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1)
            }
            else if (name == "url") {
                a[1] = "class"
                b[1] = "url"
                S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1)
            }
            else {
                scantype = oldtype[i--]
                s = saved s        # no match, stack it back.
                flag = 1
            }
        }

        else if (scantype == "review") {

            # This must be tested on each loop!
            if (oldtype[i] == "") rel = 0

            if (name == "review") {
                a[1] = "class"
                b[1] = "hreview"
                S = S _mrwresponse(m,div,"","","","","","",a,b,1,"!")

                a[1] = "class"
                b[1] = "item"
                S = S _mrwresponse(m,"","span","","","","","",a,b,1,"!")

                # Handle optional abbreviation for "fn" class.
                # The caret "^" resembles an "A" and stands for "Abbreviation".
                # A "lat;long" abbreviation wraps the name into a geo <abbr>,
                # as described in the file header.
                if (split(value,c,"^") > 1) {
                    if (c[2] ~ /^[-+]?[0-9]+\.[0-9]+ *; *[-+]?[0-9]+\.[0-9]+/)
                        value = "<abbr class='geo' title='" c[2] "'>" \
                                c[1] "</abbr>"
                }
                # Add more optional abbreviations below.

                a[1] = "class"
                b[1] = "fn"
                S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1)
            }
            else if (name == "by" || name == "reviewer") {
                a[1] = "class"
                b[1] = "reviewer"
                S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1)
            }
            else if (name == "rating") {
                a[1] = "class"
                b[1] = "rating"
                S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1)
            }
            else if (name == "date") {
                a[1] = "class"
                b[1] = "dtreviewed"
                S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1)
            }
            else if (name == "summary") {
                a[1] = "class"
                b[1] = "summary"
                S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1)
            }
            else if (name ~ /^desc(ription)?$/) {
                a[1] = "class"
                b[1] = "description"
                S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1)
            }
            else {
                scantype = oldtype[i--]
                s = saved s        # no match, stack it back.
                flag = 1
            }
        }

        else if (scantype == "areview") {

            if (name == "areview") {
                a[1] = "class"
                b[1] = "hreview-aggregate"
                S = S _mrwresponse(m,div,"","","","","","",a,b,1,"!")

                a[1] = "class"
                b[1] = "item"
                S = S _mrwresponse(m,"","span","","","","","",a,b,1,"!")

                # Handle optional abbreviation for "fn" class.
                # The caret "^" resembles an "A" and stands for "Abbreviation".
                # A "lat;long" abbreviation wraps the name into a geo <abbr>,
                # as described in the file header.
                if (split(value,c,"^") > 1) {
                    if (c[2] ~ /^[-+]?[0-9]+\.[0-9]+ *; *[-+]?[0-9]+\.[0-9]+/)
                        value = "<abbr class='geo' title='" c[2] "'>" \
                                c[1] "</abbr>"
                }
                # Add more optional abbreviations below.

                a[1] = "class"
                b[1] = "fn"
                S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1)
            }
            else if (name == "rating") {
                # The rating is a wrapper element holding the average.
                a[1] = "class"
                b[1] = "rating"
                S = S _mrwresponse(m,"","span","","","","",value,a,b,1,"!")

                a[1] = "class"
                b[1] = "average"
                S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!")
                S = S _mrwresponse(m,"",".","","","","","",a,b,1,"!")
            }
            else if (name == "count") {
                a[1] = "class"
                b[1] = "count"
                S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1)
            }
            else if (name == "summary") {
                a[1] = "class"
                b[1] = "summary"
                S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1)
            }
            else {
                scantype = oldtype[i--]
                s = saved s        # no match, stack it back.
                flag = 1
            }
        }

        else if (scantype == "geo") {

            # This must be tested on each loop!
            if (oldtype[i] == "") rel = 0

            flag = 0

            if (name == "geo") {

                if (oldtype[i] ~ /^(org|person|review)$/) subrel = 1
                else subrel = 0

                if (!rel) {
                    a[1] = "class"
                    b[1] = "geo"
                    S = S _mrwresponse(m,div,"","","","","","",a,b,1,"!")
                }
                else {
                    a[1] = "class"
                    b[1] = "geo"
                    if (subrel)
                        S = S _mrwresponse(m1,"","span","","","","","",a,b,1,"!")
                    else
                        S = S _mrwresponse(m,"","span","","","","","",a,b,1,"!")
                }

                # reuse vars for workareas, for compactness: from here
                # on "name" is the body and "value" the mapping schema.
                name = value = _strip(value,_O_CRUSH)

                sub(/ .*/,"",name)

                # The optional "name" and "schema" values of the (:v-geo ...:)
                # CPI are taken into account only if they contain at least one
                # alphabetic character.

                if (!sub(/[^ ]+ /,"",value) || \
                    value !~ /[a-zA-Z]/) value = _NULL

                # abide by the specs.
                if (name !~ /[a-zA-Z]/) name = "Earth"
                name = _nlsmap(_NULL,name)

                # Default mapping schema for Earth. This may become an
                # associative array of default values in the future, one
                # for each supported body, so that we do not default to
                # an Earth-related schema if the body is, say, "Mars".

                if (value == "") value = "WGS84"

                a[1] = "class"
                b[1] = "body"
                if (subrel)
                    S = S _mrwresponse(m1,"","","","","","span",name,a,b,1,"!",1)
                else
                    S = S _mrwresponse(m,"","","","","","span",name,a,b,1,"!",1)

                # NOTE(review): this emits two space-separated class
                # tokens; check the geo-extension-strawman page whether a
                # single hyphenated class name was intended.
                a[1] = "class"
                b[1] = "reference frame"
                if (subrel)
                    S = S _mrwresponse(m1,"","","","","","span",value,a,b,1,"!",1)
                else
                    S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1)
            }
            else if (name ~ /^lat(itude)?$/) {
                a[1] = "class"
                b[1] = "latitude"
                if (subrel)
                    S = S _mrwresponse(m1,"","","","","","span",value,a,b,1,"!",1)
                else
                    S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1)
            }
            else if (name ~ /^long(itude)?$/) {
                a[1] = "class"
                b[1] = "longitude"
                if (subrel)
                    S = S _mrwresponse(m1,"","","","","","span",value,a,b,1,"!",1)
                else
                    S = S _mrwresponse(m,"","","","","","span",value,a,b,1,"!",1)
            }
            else {
                scantype = oldtype[i--]
                s = saved s        # no match, stack it back.
                flag = 1
            }
        }

        # Known tag with no context open to receive it: emit it as text.
        else S = S "(:v-" name " " value ":)"
    }

    # Close the resulting top-level structure. Leave any subrel(s) alone
    # for the moment, but it may be necessary to try and uncomment next
    # line if any structure errors occur under certain circumstances.

    #if (subrel) S = S _mrwresponse(m1,".","","","","","","",a,b,1,"!")

    S = S _mrwresponse(m,".","","","","","","",a,b,1,"!")

    # Better comment this out while debugging.
    S = S _RRMATCH

    # Restore the ":)" terminators of whatever was left unparsed.
    gsub(/\001/,":)",S)

    return S
}
