#!/usr/bin/env ruby

# Read a pair of xml files containing database dumps of one or more
# "fn_soundGroup" elements and related "personnel" info, and write a
# corresponding set of xml files containing selected information in
# conformance with the "ISLE Meta Data Initiative" (IMDI) standard for
# describing "multi-media and multi-modal language resources."  For
# details on IMDI, consult https://tla.mpi.nl/imdi-metadata/ ("The
# Language Archive at the Max Planck Institute for
# Psycholinguistics").

# Output files are created in a directory called "./imdi" (which is
# created if necessary); file names are built from the project Id
# ("MDP0272") plus the date-and-letter ID found in each sound file name.

# Script originally created by David Graff; Version 3: May 13, 2017.

# This is free software; you may copy, modify and redistribute at
# will, provided that you maintain mention of the original author, and
# provide commentary in the source code to describe the changes you
# make (if any) to the version you received.

# Changes made to Version 3:
#  - input files are closed as soon as they have been parsed;
#  - a fn_soundGroup whose fn_sound name lacks the date-letter ID, or
#    which has other fields before any fn_sound, is reported and
#    skipped instead of crashing or altering the previous session;
#  - contributor fields that refer to a contributor that was never
#    declared are reported and skipped instead of crashing;
#  - sessions without contributors, with an unknown session type, with
#    no origin for a contributor, or with no usable session date no
#    longer abort the run ("Unspecified" / no keywords are emitted);
#  - ages are computed from the first four-digit year found in the
#    birth date and in the session date;
#  - the ".wav" suffix is matched literally when deriving session names.

require 'nokogiri'

if ARGV.size != 2 || !ARGV.all? { |path| File.file?(path) }
  warn "Usage: #{$PROGRAM_NAME} personnel_data.xml session_data.xml"
  exit 1
end

Dir.mkdir("imdi") unless File.directory?("imdi")

persfile, sessfile = ARGV

# persons["first_name last_name"] = { :Code => ..., :Birthdate => ..., ... }
# Entries are created on first access, so looking up an unknown name
# yields an empty hash rather than nil.
persons = Hash.new { |h, k| h[k] = {} }

# Keyword lists keyed by the session-type token, i.e. the second
# "_"-separated field of the session name.
keywords = {
  'Cazas' => ['Ethnobiology', 'Ethnobotany', 'Economic botany', 'Plants',
              'Hunting and Fishing', 'Material culture'],
  'Botan' => ['Ethnobiology', 'Ethnobotany', 'Economic botany', 'Plants',
              'Plant names', 'Plant uses'],
  'Comid' => ['Ethnobiology', 'Ethnobotany', 'Economic botany', 'Plants',
              'Plant names', 'Plant uses', 'Food preparation', 'Edible plants'],
  'MatCl' => ['Ethnobiology', 'Ethnobotany', 'Economic botany', 'Plants',
              'Plant names', 'Plant uses', 'Material culture', 'Tools'],
  'Medic' => ['Ethnobiology', 'Ethnobotany', 'Economic botany', 'Plants',
              'Plant names', 'Plant uses', 'Disease', 'Medicinal plants',
              'Sickness', 'Health'],
}

# ---- Pass 1: personnel data ------------------------------------------

xml_p = File.open(persfile) { |fh| Nokogiri::XML(fh) }

# The name parts are declared outside the loop, so they carry over from
# one child element (and one group) to the next: "code" and "birthdate"
# are filed under whichever names were seen most recently.
lname = nil
fname = nil
xml_p.xpath('/database/last_nameGroup').each do |pgrp|
  pgrp.element_children.each do |pchild|
    ptxt = pchild.text
    case pchild.name
    when "last_name"
      lname = ptxt
    when "first_name"
      fname = ptxt
    when "code"
      persons["#{fname} #{lname}"][:Code] = ptxt
    when "birthdate"
      # Only the first four-digit run (the year) is kept.
      persons["#{fname} #{lname}"][:Birthdate] = ptxt[/\d{4}/] || "Unspecified"
    end
  end
end

# ---- Pass 2: session data --------------------------------------------

sessions = []
xml_i = File.open(sessfile) { |fh| Nokogiri::XML(fh) }
xml_i.xpath('/database/fn_soundGroup').each do |fnsgrp|
  session = nil # the session record being filled in for this group

  fnsgrp.element_children.each do |fchild|
    nname = fchild.name
    ntext = fchild.text

    if nname == "fn_sound"
      # The date-letter ID (e.g. "2012-07-15-a") names the output file.
      fid = ntext[/[_-](20\d\d-\d\d-\d\d-[a-z])/, 1]
      if fid.nil?
        warn "skipping fn_soundGroup: no date-letter ID in fn_sound /#{ntext}/"
        break
      end
      session = { :FileID => fid, :Name => ntext.sub(/\.wav$/, "") }
      sessions.push(session)
      next
    end

    if session.nil?
      warn "skipping fn_soundGroup: <#{nname}> found before any fn_sound"
      break
    end

    # The order of the branches matters: "conN_role" must be tested
    # before the generic "conN_<field>" pattern.
    case nname
    when "titspn"
      session[:Title] = ntext
    when "descrip"
      session[:Descrip] = ntext
    when /^(duration|size|date|rec_[fm]\w+)/
      # e.g. "rec_format" is stored under :Rec_format
      session[nname.capitalize.to_sym] = ntext
    when /contr(\d)/
      cnum = Regexp.last_match(1)
      session[:Actors] = [] if cnum == "1" || session[:Actors].nil?
      # "Last, First" is turned into "First Last" to match the persons keys.
      aname = ntext.sub(/^(.*), (.*)/, '\2 \1')
      session[:Actors].push({ :Name => aname })
      warn "Unknown person: #{aname}" unless persons.key?(aname)
    when /con(\d)_role/
      actor = (session[:Actors] || [])[Regexp.last_match(1).to_i - 1]
      if actor.nil?
        warn "no contributor for <#{nname}> in #{session[:Name]}"
      else
        actor[:Role] = ntext
      end
    when /con(\d)_(\w+)/
      cidx = Regexp.last_match(1).to_i - 1
      clbl = Regexp.last_match(2).capitalize.to_sym
      actor = (session[:Actors] || [])[cidx]
      if actor.nil?
        warn "no contributor for <#{nname}> in #{session[:Name]}"
        next
      end
      # Per-person facts found in the session dump are merged into the
      # persons table; a value that is already present wins, and any
      # disagreement is reported.
      person = persons[actor[:Name]]
      if !person.key?(clbl)
        person[clbl] = ntext
      elsif person[clbl] != ntext
        warn format("discrepancy: FileID=%s Name=/%s/ Label=/%s/: %s vs. %s",
                    session[:FileID], actor[:Name], clbl.to_s, ntext, person[clbl])
      end
    when "genreGroup"
      # NOTE(review): :Keys is collected here but nothing in this script
      # reads it; the emitted keywords come from the keywords table.
      session[:Keys] = []
      fchild.element_children.each do |cchild|
        next unless cchild.name == 'subgenre'

        session[:Keys] = cchild.text.split(/\s*\p{Punct}+\s*/)
      end
    end
  end
end

# ---- Pass 3: one IMDI file per session -------------------------------

sessions.each do |ssn|
  actors = ssn[:Actors] || []
  warn "no Actors in #{ssn[:Name]}" if actors.empty?

  # One speaker is a description; otherwise the role of the second
  # contributor decides between interview and conversation.
  subg =
    if actors.empty?
      "Unspecified"
    elsif actors.size == 1
      "Description"
    elsif actors[1][:Role] == 'Interviewer'
      "Interview"
    else
      "Conversation"
    end

  session_type = ssn[:Name].split(/_/)[1]
  topic_keys = keywords.fetch(session_type) do
    warn "no keywords for session type /#{session_type}/ in #{ssn[:Name]}"
    []
  end

  session_year = ssn[:Date].to_s[/\d{4}/]

  xml_o = Nokogiri::XML::Builder.new do |xml|
    xml.METATRANSCRIPT({ 'ArchiveHandle' => "",
                         'Date' => "2016-01-29",
                         'FormatId' => "IMDI 3.03",
                         'Originator' => "",
                         'Type' => "SESSION",
                         'Version' => "0",
                         'NS2:schemaLocation' => "http://www.mpi.nl/IMDI/Schema/IMDI ./IMDI_3.0.xsd",
                         'xmlns' => "http://www.mpi.nl/IMDI/Schema/IMDI",
                         'xmlns:NS2' => "http://www.w3.org/2001/XMLSchema-instance" }) {
      xml.Session {
        xml.Name ssn[:Name]
        xml.Title ssn[:Title]
        xml.Date ssn[:Date]
        xml.Description({ :LanguageId => "ISO639-3:spa", :Link => "" }, ssn[:Descrip])
        xml.MDGroup {
          xml.Location {
            xml.Continent "Middle America"
            xml.Country "Mexico"
            xml.Region "Sierra Nororiental de Puebla"
          }
          xml.Project {
            xml.Name "0259-IPF0178"
            xml.Title "Documentation of Nahuat Knowledge of Natural History, Material Culture, Medicine, Hunting and Fishing, and Ecology"
            xml.Id "MDP0272"
            xml.Contact {
              xml.Name "Jonathan D. Amith"
              xml.Email "nahuatl.biology@gmail.com"
            }
          }
          xml.Keys {
            # NOTE(review): SessionId is filled from the session title,
            # not from the session name or FileID -- confirm intended.
            xml.Key({ :Name => "SessionId", :Type => "OpenVocabulary" }, ssn[:Title])
            xml.Key({ :Name => "CorpusId", :Type => "OpenVocabulary" }, "0292")
          }
          xml.Content {
            xml.Genre "Discourse"
            xml.SubGenre({ :Link => "http://www.mpi.nl/IMDI/Schema/Content-SubGenre.xml",
                           :Type => "OpenVocabularyList" }, subg)
            xml.CommunicationContext
            xml.Languages {
              xml.Language {
                xml.Id "ISO639:azz"
                xml.Name({ :Link => "http://www.mpi.nl/IMDI/Schema/MPI-Languages.xml",
                           :Type => "OpenVocabulary" }, "Sierra Nororiental de Puebla Nahuat")
                xml.Description({ :LanguageId => "ISO639-3:eng", :Link => "" }, "subject language")
              }
              xml.Language {
                xml.Id "ISO639:eng"
                xml.Name({ :Link => "http://www.mpi.nl/IMDI/Schema/MPI-Languages.xml",
                           :Type => "OpenVocabulary" }, "English")
                xml.Description({ :LanguageId => "ISO639-3:eng", :Link => "" }, "working language")
              }
              xml.Language {
                xml.Id "ISO639:spa"
                xml.Name({ :Link => "http://www.mpi.nl/IMDI/Schema/MPI-Languages.xml",
                           :Type => "OpenVocabulary" }, "Spanish")
                xml.Description({ :LanguageId => "ISO639-3:eng", :Link => "" }, "working language")
              }
            }
            xml.Keys {
              xml.Key({ :Name => "Topic", :Type => "OpenVocabulary" }, "Ethnobiology")
              topic_keys.each do |kywd|
                xml.Key({ :Name => "Keyword", :Type => "OpenVocabulary" }, kywd)
              end
            }
          }
          unless actors.empty?
            xml.Actors {
              actors.each do |actr|
                xml.Actor {
                  xml.Role({ :Link => "http://www.mpi.nl/IMDI/Schema/Actor-Role.xml",
                             :Type => "OpenVocabularyList" }, actr[:Role])
                  xml.Name actr[:Name]
                  xml.FullName actr[:Name]
                  prsn = persons[actr[:Name]]
                  xml.Code prsn[:Code]
                  xml.FamilySocialRole(:Link => "http://www.mpi.nl/IMDI/Schema/Actor-FamilySocialRole.xml",
                                       :Type => "OpenVocabularyList")
                  xml.Languages

                  origin = prsn[:Origin]
                  ethnic_group =
                    if origin.nil?
                      "Unspecified"
                    elsif origin == "U.S."
                      origin
                    else
                      "Village of origin: #{origin}"
                    end
                  xml.EthnicGroup({ :Type => "OpenVocabulary" }, ethnic_group)

                  # Age in whole years at recording time, from the years only.
                  birth_year = prsn[:Birthdate].to_s[/\d{4}/]
                  age =
                    if birth_year && session_year
                      (session_year.to_i - birth_year.to_i).to_s
                    else
                      "Unspecified"
                    end
                  xml.Age age
                  xml.BirthDate prsn[:Birthdate]
                  xml.Sex({ :Link => "http://www.mpi.nl/IMDI/Schema/Actor-Sex.xml",
                            :Type => "ClosedVocabulary" }, prsn[:Sex])
                  xml.Education
                  xml.Anonymized({ :Link => "http://www.mpi.nl/IMDI/Schema/Boolean.xml",
                                   :Type => "ClosedVocabulary" }, "false")
                  xml.Keys
                }
              end
            }
          end
        }
        xml.Resources {
          xml.MediaFile(:ResourceId => "") {
            xml.ResourceLink({ :ArchiveHandle => "" }, "#{ssn[:Name]}.wav")
            xml.Type({ :Link => "http://www.mpi.nl/IMDI/Schema/MediaFile-Type.xml",
                       :Type => "ClosedVocabulary" }, "audio")
            xml.Format({ :Link => "http://www.mpi.nl/IMDI/Schema/MediaFile-Format.xml",
                         :Type => "OpenVocabulary" }, "audio/x-wav")
            xml.Size ssn[:Size]
            xml.Quality({ :Type => "ClosedVocabulary" }, "5")
            xml.RecordingConditions
            xml.TimePosition {
              xml.Start "Unspecified"
            }
            xml.Access {
              xml.Availability({ :Type => "OpenVocabulary" }, "U")
              xml.Date
              xml.Owner
              xml.Publisher
              xml.Contact
            }
            xml.Description({ :LanguageId => "ISO639-3:eng", :Link => "", :Name => "Description" },
                            "Duration: #{ssn[:Duration]}; Sampling rate / bit depth: #{ssn[:Rec_format]}")
            xml.Keys {
              xml.Key({ :Name => "Status", :Type => "OpenVocabulary" }, "complete")
              xml.Key({ :Name => "RecordingEquipment", :Type => "OpenVocabulary" },
                      "Recorder: #{ssn[:Rec_machine]}")
              xml.Key({ :Name => "RecordingEquipment", :Type => "OpenVocabulary" },
                      "Microphone: #{ssn[:Rec_mike]}")
            }
          }
          xml.WrittenResource(:ResourceId => "") {
            xml.ResourceLink({ :ArchiveHandle => "" }, "#{ssn[:Name]}.eaf")
            xml.MediaResourceLink
            xml.Date "2017-05-08"
            xml.Type({ :Link => "http://www.mpi.nl/IMDI/Schema/WrittenResource-Type.xml",
                       :Type => "OpenVocabulary" }, "ELAN")
            xml.SubType(:Link => "http://www.mpi.nl/IMDI/Schema/WrittenResource-SubType.xml",
                        :Type => "OpenVocabularyList")
            xml.Format(:Link => "http://www.mpi.nl/IMDI/Schema/WrittenResource-Format.xml",
                       :Type => "OpenVocabulary")
            xml.Size(:Type => "OpenVocabulary")
            xml.Validation {
              xml.Type(:Link => "http://www.mpi.nl/IMDI/Schema/Validation-Type.xml",
                       :Type => "ClosedVocabulary")
              xml.Methodology(:Link => "http://www.mpi.nl/IMDI/Schema/Validation-Methodology.xml",
                              :Type => "ClosedVocabulary")
            }
            xml.Derivation(:Link => "http://www.mpi.nl/IMDI/Schema/WrittenResource-Derivation.xml",
                           :Type => "ClosedVocabulary")
            xml.CharacterEncoding "UTF-8"
            xml.ContentEncoding
            xml.LanguageId(:Type => "OpenVocabulary")
            xml.Anonymized({ :Link => "http://www.mpi.nl/IMDI/Schema/Boolean.xml",
                             :Type => "ClosedVocabulary" }, "false")
            xml.Access {
              xml.Availability({ :Type => "OpenVocabulary" }, "U")
              xml.Date
              xml.Owner
              xml.Publisher
              xml.Contact
            }
            xml.Keys {
              xml.Key({ :Name => "Status", :Type => "OpenVocabulary" }, "in progress")
            }
          }
        }
      }
    }
  end

  out_path = "imdi/MDP0272_#{ssn[:FileID]}.imdi"
  File.open(out_path, "w") { |ofh| ofh.puts(xml_o.to_xml(:encoding => 'UTF-8')) }
end