#!/usr/bin/mawk -We
# =====================================================================
# csa-tbl2rc: convert a NoSQL table into rc(1) variable assignments.
#
# Copyright (c) 2002-2019 Carlo Strozzi
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; version 2 dated June, 1991.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
# =====================================================================

BEGIN {
  # Start-up: read settings, parse the command line and emit the leading
  # rc(1) block that clears stale variables carrying our prefix.

  NULL = "" ;  FS = OFS = "\t"; lists = 0; count = 1; sep = ";"

  # Get local settings.
  csa_install = ENVIRON["CSA_INSTALL"]
  stdout = ENVIRON["CSA_STDOUT"]
  stderr = ENVIRON["CSA_STDERR"]

  # Set default values if necessary.
  if (csa_install == NULL) csa_install = "/usr/local/csa"
  if (stdout == NULL) stdout = "/dev/stdout"
  if (stderr == NULL) stderr = "/dev/stderr"

  # Walk every command-line word. The loop is bounded by ARGC rather
  # than by the first empty word, so that an empty argument does not
  # silently discard the options that follow it. Options that take a
  # value consume the next word.
  for (i = 1; i < ARGC; i++) {
    if (ARGV[i] == "-p" || ARGV[i] == "--prefix") prefix = ARGV[++i]
    else if (ARGV[i] == "-t" || ARGV[i] == "--trim") trim = 1
    else if (ARGV[i] == "-T" || ARGV[i] == "--trunc") trunc = ARGV[++i]
    else if (ARGV[i] == "-K" || ARGV[i] == "--key") key = ARGV[++i]
    else if (ARGV[i] == "-M" || ARGV[i] == "--max-env") {
         max_env = ARGV[++i]
    }
    else if (ARGV[i] == "-i" || ARGV[i] == "--input") i_file = ARGV[++i]
    else if (ARGV[i] == "-o" || ARGV[i] == "--output") o_file = ARGV[++i]
    else if (ARGV[i] == "-L" || ARGV[i] == "--lists") lists = 1
    else if (ARGV[i] == "-u" || ARGV[i] == "--union") union = 1
    else if (ARGV[i] == "-n" || ARGV[i] == "--no-count") count = 0
    else if (ARGV[i] == "-b" || ARGV[i] == "--break") sep = "\n"
    else if (ARGV[i] == "-S" || ARGV[i] == "--script") cmd = ARGV[++i]
#   else if (ARGV[i] == "-h" || ARGV[i] == "--help") {
#      system("grep -v '^#' " csa_install "/help/csa-tbl2rc.txt")
#      exit(rc=1)
#   }                                   # No help available yet.
  }

  ARGC = 1                                      # Fix argv[]

  if (o_file == NULL) o_file = stdout
  if (i_file != NULL) { ARGV[1] = i_file; ARGC = 2 }
  if (prefix == NULL) prefix = "c_"             # Default prefix.
  else if (prefix == "..NONE..") prefix = NULL

  # Get max. size from environment, if available.
  if (max_env == NULL) max_env = ENVIRON["CSA_DBLOAD_MAX"]

  printf("{ ") > o_file

  # If the input table does not contain all of the fields that
  # are expected by the calling program, it may happen that
  # a pre-existing variable with the same name already contains
  # a value, that will not be what the calling program expects.
  # To prevent this, any environment variables whose names begin
  # with the specified prefix are cleared on output, before they
  # are assigned the new values, if any. To avoid clearing important
  # program variables, the programmer should therefore specify
  # a sensible prefix with the '--prefix' option.
  #
  # The prefix is compared literally with index(), so characters that
  # are special in regular expressions (e.g. '+') cannot make the test
  # match names that do not really start with the prefix.

  if (prefix != NULL) {
     for (env in ENVIRON) {
         if (index(env, prefix) == 1) printf("%s=()%s", env, sep) > o_file
     }
  }

  printf("}%s",sep) > o_file
}


# Table header.

NR == 1 {

  # Header line: record the column names and open the output block.

  gsub(/\001/,"")                               # remove SOH

  for (i = 1; i <= NF; i++) {

    # The body rule looks names up by field position, so every position
    # must keep its own name. Building c_names[] from a de-duplicated
    # list would shift all names that follow a repeated column.
    c_names[i] = $i

    # Position of the first occurrence of each name, plus the
    # blank-separated list of unique names, kept as before.
    if (!P[$i]) {
       if (i == 1) auto_col = $i
       else auto_col = auto_col " " $i
       P[$i] = i
    }
  }

  # Make sure we output something valid in any case.
  # Warning: do *not* do this in the BEGIN block, as it must be done
  # *only* if there is actually something on stdin. If an AWK error
  # occurs while reading the input file, the END block will not
  # be executed and the output {} would remain unbalanced.

  printf("{ ") > o_file
  bracket = 1                   # tells END the block is already open.

  next
}

# Table body.

{
  # Data row: emit one rc(1) assignment per field.

  # With a key, only rows whose first field equals it are processed.
  if (key != NULL) {
     if ($1 == key) got_key = 1
     else next
  }

  i = nf_tmp = 0
  while (++i <= NF) {                   # Process each field in turn.

    if (trunc) $i = substr($i,1,trunc)  # Truncate if requested.

    # Unescape tabs and newlines first.
    $i = unescape($i)

    if (trim) { sub(/^ +/, NULL, $i); sub(/ +$/, NULL, $i) }

    # Only non-null values are preserved if union. This seems
    # logical to me, and it will prevent a number of issues.
    if ($i == NULL && union) continue

    # If the "--lists" option was specified, then embedded newlines
    # in the data will be treated as rc(1) list element separators.

    if (lists) j = split($i,a,"\n")
    else a[j=1] = $i

    for (x=1; x<=j; x++) {

        # Embedded single-quotes must be doubled for rc(1).
        gsub(/'/, "''", a[x])

        if (a[x] == NULL) {
           # preserve empty elements if list.
           if (j > 1) a[x] = "'" a[x] "'"
           else a[x] = "()"
        }
        else a[x] = "'" a[x] "'"

        value = value " " a[x]
    }

    # Limit the environment size if requested.
    # NOTE(review): the size estimate uses the field index 'i', not the
    # column name; left as is, confirm whether c_names[i] was intended.
    if (max_env) {
       env_size += length(prefix i $i)
       if (env_size > max_env) {
          value = NULL                  # drop the partial value.
          break
       }
    }

    printf("%s%s=(%s)%s", prefix, c_names[i], value, sep) > o_file

    nf_tmp++

    value = NULL
  }

  if (nf_tmp > nf_max) nf_max = nf_tmp

  # Pick only the first matching row in any case, unless option
  # "--union" was specified, in which case we will effectively
  # print the assignments corresponding to the union set of all
  # values in the input table. If a key value is also specified,
  # then the union will be limited to the matching records.

  # The line break must go to the same destination as the assignments,
  # not to awk's standard output, or it is lost when "--output" is used.
  if (union) print NULL > o_file  # let's prevent looong lines on output.
  else exit                       # the header rule never reaches here.
}

END {
  # Close the output block and append the optional trailer.

  # Leave at once, with that status, if an exit code was recorded.
  if (rc) exit(rc)

  # The header rule sets 'bracket' after opening the block; if it never
  # ran, open the block here so that the closing brace is balanced.
  if (!bracket) printf("{ ") > o_file

  # Print total no. of unique assignments.
  if (count) printf(" %s0=%d", prefix, nf_max) > o_file
  printf(" }\n") > o_file

  # Append execution of optional rc(1) script if requested.
  # Compare against the string constant NULL so that the test is always
  # a string comparison, whatever the script name looks like.
  if (cmd != NULL) print cmd > o_file
}

# *********************************************************************
# string unescape(string s)
#
# Scans 's' one character at a time. A backslash followed by 'n' or 't'
# becomes a physical newline or tab; a backslash followed by anything
# else is kept together with that character. Finally every remaining
# pair of backslashes is reduced to a single one. Returns the result.
# *********************************************************************
function unescape(s,                    out,pos,len,ch,pending) {
  len = length(s)

  # Go one position past the end: the empty character read there
  # flushes a backslash left pending at the very end of the string.
  for (pos = 1; pos <= len + 1; pos++) {
    ch = substr(s, pos, 1)

    if (pending) {
       pending = 0
       if (ch == "n") out = out "\n"
       else if (ch == "t") out = out "\t"
       else out = out "\\" ch           # Not an escape we translate.
    }
    else if (ch == "\\") pending = 1    # Decide on the next character.
    else out = out ch
  }

  gsub(/\\\\/,"\\",out)                 # Strip '\\' sequences.
  return out
}

# End of program.
