#!/usr/bin/mawk -We
# *********************************************************************
# csa-tbl2rdf: takes a NoSQL table and turns it into a proper RDF
#	       construct
#
# Copyright (c) 2003,2006 Carlo Strozzi
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; version 2 dated June, 1991.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
# *********************************************************************
# $Id: csa-tbl2rdf 43 2007-09-24 08:30:21Z carlo $

BEGIN {
  # Start-up: set constants, read the environment, build the datatype
  # patterns and parse the command line.
  #
  # Recognised options:
  #   -l, --last       keep the last of several same-named columns
  #   -c, --call       emit <param> elements in "call" mode
  #   -C, --count N    numeric value used when numbering the output rows
  #   -t, --trim       strip leading/trailing blanks from each value
  #   -i, --input F    read the table from file F
  #   -o, --output F   stored in o_file
  #   -h, --help       show the help text and exit with status 1

  # Input and output fields are TAB-separated.
  NULL = "" ; FS = OFS = "\t"; c_type = NULL

  # Defaults; both are replaced together by -c/--call below.
  mode = "response"; element = "result"

  # Get local settings.
  csa_install = ENVIRON["CSA_INSTALL"]
  stdout = ENVIRON["CSA_STDOUT"]
  stderr = ENVIRON["CSA_STDERR"]
  rdf_body = ENVIRON["CSA_RDF_BODY"]
  rdf_xsd = ENVIRON["CSA_RDF_XSD"]

  # Set default values if necessary.
  if (csa_install == NULL) csa_install = "/usr/local/csa"
  if (stdout == NULL) stdout = "/dev/stdout"
  if (stderr == NULL) stderr = "/dev/stderr"
  if (rdf_body == NULL) rdf_body = stdout

  #if (rdf_xsd == NULL) rdf_xsd = "&xsd;"

  # Better to always use this form, but it mandates that the
  # relevant namespace be defined in the XML prologue.
  # Note that this unconditionally overrides CSA_RDF_XSD read above.
  rdf_xsd = "xsd:"

  # Patterns used to guess the datatype of a value. The decimal point
  # is written as a bracket expression so that it matches only a
  # literal "."; a bare "." matches any character, which would make
  # plain integers of three or more digits look like doubles.
  double_re = "^[-+]?[0-9]+[.][0-9]+$"
  int_re = "^[-+]?[0-9]+$"
  time_re = "^[1-9][0-9]+T[0-9][0-9]:[0-9][0-9]:[0-9][0-9]$"

  # Walk the command line; the first empty or missing argument ends it.
  while (ARGV[++i] != NULL) {
    if (ARGV[i] == "-l" || ARGV[i] == "--last") pick_last = 1
    else if (ARGV[i] == "-c" || ARGV[i] == "--call") {
       mode = "call"
       element = "param"
    }
    else if (ARGV[i] == "-C" || ARGV[i] == "--count") {
       cnt = ARGV[++i]
       # Dividing by 1 forces a numeric value; a positive count is
       # then decremented by one.
       if ((cnt/=1) > 0) cnt--
    }
    else if (ARGV[i] == "-t" || ARGV[i] == "--trim") trim = 1
    else if (ARGV[i] == "-i" || ARGV[i] == "--input") i_file = ARGV[++i]
    else if (ARGV[i] == "-o" || ARGV[i] == "--output") o_file = ARGV[++i]
    else if (ARGV[i] == "-h" || ARGV[i] == "--help") {
       system("grep -v '^#' " csa_install "/help/csa-tbl2rdf.txt")
       exit(rc=1)
    }
  }

  # Hide the options from awk so that they are not opened as input files.
  ARGC = 1					# Fix argv[]

  if (o_file == NULL) o_file = stdout
  if (i_file != NULL) { ARGV[1] = i_file; ARGC = 2 }
}

#
# Load the input data.
#

# Column names.
NR == 1 {
  # Header line: record the position of every column name in P[] and
  # build the list of distinct names, in order of first appearance.
  gsub(/\001/,"")			# Remove SOH markers.

  for (i = 1; i <= NF; i++) {
    if (!P[$i]) {
      # First time this name is seen: append it to the name list and
      # remember where it lives.
      auto_col = (i == 1 ? $i : auto_col " " $i)
      P[$i] = i
    }
    else if (pick_last) {
      # Repeated name: with -l/--last the later position wins.
      P[$i] = i
    }
  }

  split(auto_col, c_names, " ")

  next
}

# Process the table body. Although it would be possible to preserve
# the table data dictionary (i.e. column names) on output, I have
# deliberately chosen not to do so, to stick to the view that an RPC
# mechanism should return unnamed values (possibly arrays).

{
   # Emit one <element> wrapping an rdf:Seq for every data row. The
   # sequence is identified as "#<mode>.<n>", where n is NR-1+cnt.

   printf("<%s>\n<rdf:Seq rdf:about=\"#%s.%d\">\n", \
				element,mode,NR-1+cnt) > rdf_body

   i = 0
   while (P[c_names[++i]]) {		# Process each field in turn.

      printf("<rdf:li rdf:datatype=\"") > rdf_body

      # Undo the table escapes first, then make the value XML-safe.
      c_value = xmlencode(unesctbl($P[c_names[i]]))

      # Strip surrounding blanks from the value. The target must be
      # passed explicitly: a two-argument sub() works on $0 and would
      # rewrite the current record instead of trimming c_value.
      if (trim) { sub(/^[ \t]+/,"",c_value); sub(/[ \t]+$/,"",c_value) }

      # Guess the datatype from the shape of the value; empty values
      # and anything unrecognised are plain strings.
      if (c_value != NULL) {
	 if (c_value ~ double_re) c_type = "double"
	 else if (c_value ~ int_re) c_type = "int"
	 else if (c_value ~ time_re) c_type = "dateTime"
	 else c_type = "string"
      }
      else c_type = "string"

      printf("%s%s\">%s</rdf:li>\n",rdf_xsd,c_type,c_value) > rdf_body
   }

   printf("</rdf:Seq></%s>\n",element) > rdf_body
}

# =====================================================================
# string unesctbl(string s)
#
# Un-escape NoSQL special sequences \t, \n and \\
# Note: global variables, like RSTART and RLENGTH cannot be localised
# with their original names, or things won't work.
# =====================================================================

function unesctbl(s,		done,past) {

   # "done" collects the part of the string already processed in the
   # current pass; "past" is the position just after the current match.
   # RSTART and RLENGTH are read straight after each match().

   # Pass 1: a run of backslashes followed by "t". The match length
   # includes the "t", so an odd length means an even number of
   # backslashes, i.e. no escaped TAB: the text is kept unchanged.
   # Otherwise the last backslash and the "t" become a real TAB.
   done = ""
   while (match(s,/\\+t/)) {
      past = RSTART + RLENGTH
      if (RLENGTH % 2)
	 done = done substr(s,1,past-1)
      else
	 done = done substr(s,1,past-3) "\t"
      s = substr(s,past)
   }
   s = done s

   # Pass 2: same as above, for "n" and the newline character.
   done = ""
   while (match(s,/\\+n/)) {
      past = RSTART + RLENGTH
      if (RLENGTH % 2)
	 done = done substr(s,1,past-1)
      else
	 done = done substr(s,1,past-3) "\n"
      s = substr(s,past)
   }
   s = done s

   # Pass 3: collapse every pair of backslashes into a single one, by
   # keeping the text up to the first backslash of the pair and
   # resuming right after the second.
   done = ""
   while (match(s,/[\\][\\]/)) {
      done = done substr(s,1,RSTART)
      s = substr(s,RSTART+2)
   }

   return done s
}

# =====================================================================
# string xmlencode(string s)
#
# Encodes the XML special characters & < > " and ' in the input string
# as the predefined entities &amp; &lt; &gt; &quot; and &apos;, and
# returns the encoded string.
# =====================================================================
function xmlencode(s) {

  # The ampersand must be handled first, or the "&" of the entities
  # inserted below would be encoded again. In the replacement text
  # "\\&" stands for a literal "&" rather than the matched text.
  gsub(/&/, "\\&amp;", s)
  gsub(/</, "\\&lt;", s)

  # ">" must not appear unescaped in character data when it closes
  # the sequence "]]>", so it is always encoded.
  gsub(/>/, "\\&gt;", s)
  gsub(/"/, "\\&quot;", s)
  gsub(/'/, "\\&apos;", s)

  return s
}

# End of program.
