#!/usr/bin/env ruby

# Read a pair of xml files containing database dumps of one or more
# "fn_soundGroup" elements and related "personnel" info, and write a
# corresponding set of xml files containing selected information in
# conformance with the "ISLE Meta Data Initiative" (IMDI) standard for
# describing "multi-media and multi-modal language resources." For
# details on IMDI, consult https://tla.mpi.nl/imdi-metadata/ ("The
# Language Archive at the Max Planck Institute for
# Psycholinguistics").

# Output files are created in a directory called "./imdi" (which is
# created if necessary); file names are built from the "Project/Id"
# value and each session's file ID, e.g. "imdi/MDP0272_<FileID>.imdi".

# Script originally created by David Graff; Version 3: May 13, 2017.

# This is free software; you may copy, modify and redistribute at
# will, provided that you maintain mention of the original author, and
# provide commentary in the source code to describe the changes you
# make (if any) to the version you received.

require 'nokogiri'

# Expect exactly two arguments, both naming existing regular files: the
# personnel dump first, the session dump second.  Anything else prints a
# usage line on stderr and exits with status 1.
persfile, sessfile = ARGV
args_ok = ARGV.size == 2 && [ persfile, sessfile ].all? { |path| File.file?( path ) }
unless args_ok
  $stderr.puts "Usage: #{$0} personnel_data.xml session_data.xml"
  exit 1
end

# Output directory for the generated files; created on first use.
Dir.mkdir( "imdi" ) unless File.directory?( "imdi" )

# Personnel lookup: persons["first_name last_name"] = { :Code => ..., :Birthdate => ..., ... }
# An empty attribute hash is created the first time a name is looked up.
persons = Hash.new { |table, full_name| table[full_name] = {} }

# Keyword lists for the output, keyed by the session-type tag that forms the
# second "_"-separated field of a session name.  Every list starts with the
# same four terms; all but 'Cazas' also share the two plant terms.
shared_terms = [ 'Ethnobiology', 'Ethnobotany', 'Economic botany', 'Plants' ]
plant_terms  = shared_terms + [ 'Plant names', 'Plant uses' ]
keywords = {
  'Cazas' => shared_terms + [ 'Hunting and Fishing', 'Material culture' ],
  'Botan' => plant_terms.dup,
  'Comid' => plant_terms + [ 'Food preparation', 'Edible plants' ],
  'MatCl' => plant_terms + [ 'Material culture', 'Tools' ],
  'Medic' => plant_terms + [ 'Disease', 'Medicinal plants', 'Sickness', 'Health' ],
}

# Load the personnel dump and fill `persons`, keyed by "first_name last_name",
# with each person's :Code and :Birthdate.  :Birthdate is the first run of four
# digits found in the birthdate text, or "Unspecified" when there is none.
#
# The block form of File.open closes the input file as soon as it is parsed.
xml_p = File.open( persfile ) { |pfh| Nokogiri::XML( pfh ) }
xml_p.xpath( '/database/last_nameGroup' ).each do |pgrp|
  # Names are tracked per group, so a record that lacks a name element cannot
  # silently inherit the name of the previous record.
  lname, fname = nil, nil
  pgrp.element_children.each do |pchild|
    ptxt = pchild.text
    # "code" and "birthdate" are filed under the names seen so far in this
    # group, so the name elements have to come before them.
    case pchild.name
    when "last_name"
      lname = ptxt
    when "first_name"
      fname = ptxt
    when "code"
      persons["#{fname} #{lname}"][:Code] = ptxt
    when "birthdate"
      persons["#{fname} #{lname}"][:Birthdate] = ptxt[/\d{4}/] || "Unspecified"
    end
  end
end

# Load the session dump into `sessions`, one hash per "fn_soundGroup" element:
#   :FileID, :Name          from "fn_sound" (date-based ID; name without ".wav")
#   :Title, :Descrip        from "titspn" and "descrip"
#   :Duration, :Size, ...   from elements starting with duration/size/date/rec_f*/rec_m*
#   :Actors                 array of { :Name, :Role }, from "contrN" / "conN_role"
#   :Keys                   from the "subgenre" child of "genreGroup"
# Other "conN_<label>" elements are merged into `persons`; a value that differs
# from what is already recorded for that person is reported on stderr.
#
# The block form of File.open closes the input file as soon as it is parsed.
sessions = []
xml_i = File.open( sessfile ) { |sfh| Nokogiri::XML( sfh ) }
xml_i.xpath( '/database/fn_soundGroup' ).each do |fnsgrp|
  session = nil # set by this group's "fn_sound" child, expected to come first
  fnsgrp.element_children.each do |fchild|
    nname = fchild.name
    ntext = fchild.text

    if ( nname == "fn_sound" )
      id_match = ntext.match( /[_-](20\d\d-\d\d-\d\d-[a-z])/ )
      # Without the ID there is no output file name, so stop with a clear message.
      abort "Cannot find a date-based file ID in fn_sound value: #{ntext}" if id_match.nil?
      session = { :FileID => id_match[1], :Name => ntext.sub( /\.wav$/, "" ) }
      sessions.push( session )
      next
    end

    if session.nil?
      # Do not attach stray fields to the previous group's session.
      $stderr.puts "Ignoring #{nname}: no fn_sound seen yet in this group"
      next
    end

    case nname
    when "titspn"
      session[:Title] = ntext
    when "descrip"
      session[:Descrip] = ntext
    when /^(duration|size|date|rec_[fm]\w+)/
      session[nname.capitalize.to_sym] = ntext
    when /contr(\d)/
      # Contributor 1 starts a fresh list; also start one if the numbering
      # does not begin at 1, so the push below always has a target.
      session[:Actors] = [] if $1 == "1" || session[:Actors].nil?
      # "Last, First" becomes "First Last" to match the keys of `persons`.
      aname = ntext.sub( /^(.*), (.*)/ ){ "#{$2} #{$1}" }
      session[:Actors].push( { :Name => aname } )
      $stderr.puts "Unknown person: #{aname}" unless persons.has_key?( aname )
    when /con(\d)_role/
      actor = session[:Actors] && session[:Actors][$1.to_i - 1]
      if actor.nil?
        $stderr.puts "Ignoring #{nname} in #{session[:FileID]}: no such contributor"
        next
      end
      actor[:Role] = ntext
    when /con(\d)_(\w+)/
      cnum = $1.to_i - 1
      clbl = $2.capitalize.to_sym
      actor = session[:Actors] && session[:Actors][cnum]
      if actor.nil?
        $stderr.puts "Ignoring #{nname} in #{session[:FileID]}: no such contributor"
        next
      end
      person = persons[ actor[:Name] ]
      if !person.has_key?( clbl )
        person[clbl] = ntext
      elsif person[clbl] != ntext
        $stderr.puts "discrepancy: FileID=%s Name=/%s/ Label=/%s/: %s vs. %s" %
                     [ session[:FileID], actor[:Name], clbl.to_s, ntext, person[clbl] ]
      end
    when "genreGroup"
      session[:Keys] = []
      fchild.element_children.each do |cchild|
        session[:Keys] = cchild.text.split( /\s*\p{Punct}+\s*/ ) if cchild.name == 'subgenre'
      end
    end
  end
end

# Write one IMDI session file, "imdi/MDP0272_<FileID>.imdi", for each parsed
# session.  Values that depend on data that may be missing (contributors, the
# keyword list, a contributor's origin and birth year) are resolved so that an
# incomplete record yields "Unspecified" entries instead of aborting the run.
sessions.each do |ssn|
  # A session without contributor elements has no :Actors entry at all.
  actors = ssn[:Actors] || []
  subgenre = if actors.empty?
               "Unspecified"
             elsif actors.size == 1
               "Description"
             elsif actors[1][:Role] == 'Interviewer'
               "Interview"
             else
               "Conversation"
             end

  # The session type is the second "_"-separated field of the session name.
  session_type = ssn[:Name].split( /_/ )[1]
  session_keywords = keywords.fetch( session_type ) do
    $stderr.puts "no keywords for session type #{session_type.inspect} in #{ssn[:Name]}"
    []
  end

  # Four-digit year of the recording date, used to compute contributor ages.
  session_year = ssn[:Date].to_s[/\d{4}/]

  xml_o = Nokogiri::XML::Builder.new do |xml|
    xml.METATRANSCRIPT( {
                          'ArchiveHandle' => "",
                          'Date' => "2016-01-29", 'FormatId' => "IMDI 3.03",
                          'Originator' => "",
                          'Type' => "SESSION", 'Version' => "0",
                          'NS2:schemaLocation' => "http://www.mpi.nl/IMDI/Schema/IMDI ./IMDI_3.0.xsd",
                          'xmlns' =>"http://www.mpi.nl/IMDI/Schema/IMDI",
                          'xmlns:NS2' => "http://www.w3.org/2001/XMLSchema-instance" } ) {
      xml.Session {
        xml.Name ssn[:Name]
        xml.Title ssn[:Title]
        xml.Date ssn[:Date]
        xml.Description( { :LanguageId => "ISO639-3:spa", :Link => "" }, ssn[:Descrip] )
        xml.MDGroup {
          xml.Location {
            xml.Continent "Middle America"
            xml.Country "Mexico"
            xml.Region "Sierra Nororiental de Puebla"
          }
          xml.Project {
            xml.Name "0259-IPF0178"
            xml.Title "Documentation of Nahuat Knowledge of Natural History, Material Culture, Medicine, Hunting and Fishing, and Ecology"
            xml.Id "MDP0272"
            xml.Contact {
              xml.Name "Jonathan D. Amith"
              xml.Email "nahuatl.biology@gmail.com"
            }
          }
          xml.Keys {
            xml.Key( { :Name => "SessionId", :Type => "OpenVocabulary" }, ssn[:Title] )
            xml.Key( { :Name => "CorpusId", :Type => "OpenVocabulary" }, "0292" )
          }
          xml.Content {
            xml.Genre "Discourse"
            xml.SubGenre( { :Link => "http://www.mpi.nl/IMDI/Schema/Content-SubGenre.xml", :Type => "OpenVocabularyList" }, subgenre )
            xml.CommunicationContext
            xml.Languages {
              xml.Language {
                xml.Id "ISO639:azz"
                xml.Name( { :Link => "http://www.mpi.nl/IMDI/Schema/MPI-Languages.xml", :Type => "OpenVocabulary" },
                          "Sierra Nororiental de Puebla Nahuat" )
                xml.Description( { :LanguageId => "ISO639-3:eng", :Link => "" }, "subject language" )
              }
              xml.Language {
                xml.Id "ISO639:eng"
                xml.Name( { :Link => "http://www.mpi.nl/IMDI/Schema/MPI-Languages.xml", :Type => "OpenVocabulary" }, "English" )
                xml.Description( { :LanguageId => "ISO639-3:eng", :Link => "" }, "working language" )
              }
              xml.Language {
                xml.Id "ISO639:spa"
                xml.Name( { :Link => "http://www.mpi.nl/IMDI/Schema/MPI-Languages.xml", :Type => "OpenVocabulary" }, "Spanish" )
                xml.Description( { :LanguageId => "ISO639-3:eng", :Link => "" }, "working language" )
              }
            }
            xml.Keys {
              xml.Key( { :Name => "Topic", :Type => "OpenVocabulary" }, "Ethnobiology" )
              session_keywords.each do |kywd|
                xml.Key( { :Name => "Keyword", :Type => "OpenVocabulary" }, kywd )
              end
            }
          }
          if ( actors.empty? )
            $stderr.puts "no Actors in #{ssn[:Name]}"
          else
            xml.Actors {
              actors.each do |actr|
                xml.Actor {
                  xml.Role( { :Link => "http://www.mpi.nl/IMDI/Schema/Actor-Role.xml", :Type => "OpenVocabularyList" }, actr[:Role] )
                  xml.Name actr[:Name]
                  xml.FullName actr[:Name]
                  # Looking up an unknown name yields an empty attribute hash.
                  prsn = persons[actr[:Name]]
                  xml.Code prsn[:Code]
                  xml.FamilySocialRole( :Link => "http://www.mpi.nl/IMDI/Schema/Actor-FamilySocialRole.xml", :Type => "OpenVocabularyList" )
                  xml.Languages
                  # Build a new string so the value stored in `persons` is
                  # never modified; a missing origin becomes "Unspecified".
                  origin = prsn[:Origin].to_s
                  ethnic_group = if origin.empty?
                                   "Unspecified"
                                 elsif origin == "U.S."
                                   origin
                                 else
                                   "Village of origin: #{origin}"
                                 end
                  xml.EthnicGroup( { :Type => "OpenVocabulary" }, ethnic_group )
                  # Age is the difference between the two four-digit years,
                  # and only when both of them are actually present.
                  birth_year = prsn[:Birthdate].to_s[/\d{4}/]
                  age = ( session_year && birth_year ) ? ( session_year.to_i - birth_year.to_i ).to_s : "Unspecified"
                  # .dup kept from the earlier version; presumably a guard
                  # against in-place changes to the string -- TODO confirm.
                  xml.Age age.dup
                  xml.BirthDate( prsn[:Birthdate] || "Unspecified" )
                  xml.Sex( { :Link => "http://www.mpi.nl/IMDI/Schema/Actor-Sex.xml", :Type => "ClosedVocabulary" }, prsn[:Sex] )
                  xml.Education
                  xml.Anonymized( { :Link => "http://www.mpi.nl/IMDI/Schema/Boolean.xml", :Type => "ClosedVocabulary" }, "false" )
                  xml.Keys
                }
              end
            }
          end
        }
        xml.Resources {
          xml.MediaFile( :ResourceId => "" ) {
            xml.ResourceLink( { :ArchiveHandle => "" }, "#{ssn[:Name]}.wav" )
            xml.Type( { :Link => "http://www.mpi.nl/IMDI/Schema/MediaFile-Type.xml", :Type => "ClosedVocabulary" }, "audio" )
            xml.Format( { :Link => "http://www.mpi.nl/IMDI/Schema/MediaFile-Format.xml", :Type => "OpenVocabulary" }, "audio/x-wav" )
            xml.Size ssn[:Size]
            xml.Quality( { :Type => "ClosedVocabulary" }, "5" )
            xml.RecordingConditions
            xml.TimePosition {
              xml.Start "Unspecified"
            }
            xml.Access {
              xml.Availability( { :Type => "OpenVocabulary" }, "U" )
              xml.Date
              xml.Owner
              xml.Publisher
              xml.Contact
            }
            xml.Description( { :LanguageId => "ISO639-3:eng", :Link => "", :Name => "Description" },
                             "Duration: #{ssn[:Duration]}; Sampling rate / bit depth: #{ssn[:Rec_format]}" )
            xml.Keys {
              xml.Key( { :Name => "Status", :Type => "OpenVocabulary" }, "complete" )
              xml.Key( { :Name => "RecordingEquipment", :Type => "OpenVocabulary" }, "Recorder: #{ssn[:Rec_machine]}" )
              xml.Key( { :Name => "RecordingEquipment", :Type => "OpenVocabulary" }, "Microphone: #{ssn[:Rec_mike]}" )
            }
          }
          xml.WrittenResource( :ResourceId => "" ) {
            xml.ResourceLink( { :ArchiveHandle => "" }, "#{ssn[:Name]}.eaf" )
            xml.MediaResourceLink
            xml.Date "2017-05-08"
            xml.Type( { :Link => "http://www.mpi.nl/IMDI/Schema/WrittenResource-Type.xml", :Type => "OpenVocabulary" }, "ELAN" )
            xml.SubType( :Link => "http://www.mpi.nl/IMDI/Schema/WrittenResource-SubType.xml", :Type => "OpenVocabularyList" )
            xml.Format( :Link => "http://www.mpi.nl/IMDI/Schema/WrittenResource-Format.xml", :Type => "OpenVocabulary" )
            xml.Size( :Type => "OpenVocabulary" )
            xml.Validation {
              xml.Type( :Link => "http://www.mpi.nl/IMDI/Schema/Validation-Type.xml", :Type => "ClosedVocabulary" )
              xml.Methodology( :Link => "http://www.mpi.nl/IMDI/Schema/Validation-Methodology.xml", :Type => "ClosedVocabulary" )
            }
            xml.Derivation( :Link => "http://www.mpi.nl/IMDI/Schema/WrittenResource-Derivation.xml", :Type => "ClosedVocabulary" )
            xml.CharacterEncoding "UTF-8"
            xml.ContentEncoding
            xml.LanguageId( :Type => "OpenVocabulary" )
            xml.Anonymized( { :Link => "http://www.mpi.nl/IMDI/Schema/Boolean.xml", :Type => "ClosedVocabulary" }, "false" )
            xml.Access {
              xml.Availability( { :Type => "OpenVocabulary" }, "U" )
              xml.Date
              xml.Owner
              xml.Publisher
              xml.Contact
            }
            xml.Keys {
              xml.Key( { :Name => "Status", :Type => "OpenVocabulary" }, "in progress" )
            }
          }
        }
      }
    }
  end

  out_path = "imdi/MDP0272_#{ssn[:FileID]}.imdi"
  File.open( out_path, "w" ) { |ofh| ofh.puts( xml_o.to_xml( :encoding => 'UTF-8' ) ) }
end