#Nahuatl makefile.
#Fix: non-Unicode chars in here!
ifndef NAHUATL_MAKEFILE
NAHUATL_MAKEFILE = Languages/Nahuatl/makefile

#-------------Definitions--------
#The ff. are mostly so make doesn't complain about undefined variables
# (has to be before 'include's):
LANGUAGE=Nahuatl
LANGUAGE_UC=NAHUATL

#Defns which are dependencies in supra-makefiles must go here.
# (Defns which are used in the recipes themselves can go later.)
PARSER=Parser.fsm
#This uses the liberal version of the phonological rules,
# e.g. it allows k-deletion (or k --> ') to be relatively unconstrained.
# (See the file PhonologyRules.xfst, in particular the ifdef's.)
GENERATOR=Generator.fsm
#This uses the conservative version of the phonological rules,
# e.g. the k-deletion rules are fairly restricted. This allows us
# to avoid too much ambiguity in generating forms, but it may not
# parse all valid forms in text.
# RX_FILE is not defined, because I can't figure out the structure
# of the dictionary's SFM codes.

#Get defns from parent makefile:
ifdef THIS_DIR
THIS_DIR:=$(THIS_DIR)/..
else
THIS_DIR:=$(CURDIR)
endif
include $(THIS_DIR)/../makefile

DIALECT = OAPAN
#Re-define for other dialects
ENCODING = "UTF-8"
GEN_PARADIGMS_SCHEMA = ParadigmDefn.xsd
#XML Schema file to use in validating XML paradigm files
GEN_PARADIGM = $(PYTHON) $(LANGUAGES_DIR)/GenParadigm.py
#Generates the paradigm of a single word, provided on the cmd line
GEN_PARADIGMS = $(PYTHON) $(LANGUAGES_DIR)/GenParadigms.py $(DEBUG_GEN_PARADIGMS)
#Generates the paradigm of a list of words
DEBUG_GEN_PARADIGMS =
#Setting DEBUG_GEN_PARADIGMS to "-d" will cause the above GEN_PARADIGMS to output debug info

CLEAN_FILES = *.fsm *.lkp *.out *.parse *.pyc LoadRoots.xfst $(LOCAL_LEX) lexc/* \
	ParserFSMs/* GeneratorFSMs/* utf8/* TestSuite/*.html
#Some intermediate .in files are created in the dir TestSuite, but they are not included here
# because the file TestSuite/2VerbTenses.in is hand-built, and because the intermediate .in
# files are automatically deleted by 'make'. We do NOT clean the UTF-8 and XML versions of
# the dictionary, because the latter requires cranking up Toolbox.

LEXICON_FILES = Dictionary.sfm
UTF8_LEX = Dictionary.utf8
XML_LEX = Dictionary.xml
# An XML version of the Nahuatl dictionary, as produced by Shoebox/Toolbox.
# (Most of the Shoebox fields are superfluous for purposes of morphological parsing,
# so this XML version may only contain some of the fields.)
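
#Illustrative usage, not part of the build: the settings above are plain make variables,
# so they can be overridden per-invocation on the command line in the usual GNU make way.
# The particular targets and the alternative dialect name below are only examples; they
# assume the corresponding gpp conditionals and Defn.xml files actually exist:
#	make DEBUG_GEN_PARADIGMS=-d TestSuite/VerbParadigms.in
#	make DIALECT=AMEYALTEPEC Parser.fsm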
LEXC_FILES = ./lexc/Adj.lexc \
	./lexc/N.lexc \
	./lexc/N^Inalienable.lexc \
	./lexc/N^N1.lexc \
	./lexc/N^N1dom.lexc \
	./lexc/N^N1N2.lexc \
	./lexc/N^N2.lexc \
	./lexc/N^ObligPoss.lexc \
	./lexc/N^PartWhole.lexc \
	./lexc/N^PartWholeOnly.lexc \
	./lexc/Uninflected.lexc \
	./lexc/V0.lexc \
	./lexc/V1.lexc \
	./lexc/V2.lexc \
	./lexc/V3.lexc \
	./lexc/V4.lexc

XFST_FILES = ./ParserFSMs/AdjInflection.xfst \
	./ParserFSMs/LoadRoots.xfst \
	./ParserFSMs/NounInflection.xfst \
	./ParserFSMs/NounStems.xfst \
	./ParserFSMs/Parser.xfst \
	./ParserFSMs/PhonologyClasses.xfst \
	./ParserFSMs/PhonologyRules.xfst \
	./ParserFSMs/Reduplication.xfst \
	./ParserFSMs/SurfaceForms.xfst \
	./ParserFSMs/VerbInflection.xfst \
	./ParserFSMs/VerbPrefixes.xfst \
	./ParserFSMs/VerbStems.xfst \
	./ParserFSMs/VerbSuffixes.xfst

TESTSUITE_OUT = TestSuite/VerbTable.out \
	TestSuite/2VerbTenses.out \
	TestSuite/SpecialTests.out \
	TestSuite/komin.out \
	TestSuite/nimkin.out \
	TestSuite/Reduplication.out \
	TestSuite/Nouns.out \
	TestSuite/AdjectivePlurals.out \
	TestSuite/Diminutives.out \
	TestSuite/NounPlurals.out \
	TestSuite/VerbParadigms.html \
	TestSuite/VerbStemForms.html \
	TestSuite/aw.html \
	TestSuite/NounParadigms.html \
	TestSuite/SpanishNounParadigms.html \
	TestSuite/AdjParadigms.html

INSTALL_DIR = $(LORAX_ROOT)/spd25/htdocs/hyperlex2/nahuatl/
#Jonathan's dir for Nahuatl stuff

NETWORK_FILES = $(PARSER) $(GENERATOR) SR2Gl.lkp SR2UR.lkp Gl2UR.lkp GlBr2UR.lkp \
	UR2SR.lkp UR2Gl.lkp
#All the 'compiled' networks, both those used for interactive lookup
# (= $(PARSER) and $(GENERATOR)) and those used for batch-style lookup
# (= the .lkp files).

ZIP_FILES = *.xfst *.py *.xml *.xsd *.xsl makefile ../makefile \
	TestSuite/*.in TestSuite/*.list TestSuite/*.Defn.xml TestSuite/EntityDefns.xml \
	../GenParadigm.py
#Source files to be zipped up and sent to some backup facility

XFST_FLAGS = -q -utf8 -e "set verbose OFF" -e "set quit-on-fail ON"

#Jonathan's dictionary has ff. 8-bit chars in the \lxo field:
# acute 'a' and 'A' (once for upper case, but does not appear to be a typo)
# acute 'e'
# acute 'i'
# hat 'i' (once, probably a typo)
# acute 'o'
# acute 'u'
#We convert the dictionary and the xfst files to UTF-8 (see recipe for LOCAL_LEX)
# before loading them. (I've kept the xfst files in ISO-8859-1 so I can use
# a fixed-width font.)

#Choice of tokenizers:
XFST_TOKENIZER = $(XTOKENIZER) NahTokenize.fsm
#This has special handling for the 'ma' clitic.
TR_TOKENIZER = $(TR) -s "\".,!?()[]{}-��]; \r\n\t" "\n"
#Fix: non-Unicode chars! omit in favor of XFST_TOKENIZER?
#Break at white space and most punctuation (but not colon, which marks length).
# Punctuation is turned into newlines (i.e. ignored). This does _not_ do the right
# thing with the 'ma' clitic, which is written separately but needs to be tokenized
# with the word that follows. The XFST_TOKENIZER does the right thing.
#PED_TOKENIZER = $(PED) -r cp1252 -w utf8 \
#	-e 's/([��\(�>])/$$1\n/g' \
#	-e 's/(["\"\.,\!\?\){}\-�;< ])/\n$$1/g'
PED_TOKENIZER = $(ICONV) -f cp1252 -t utf8 \
	| $(SED) -e 's/([��\(�>])/$$1\n/g' \
	-e 's/(["\"\.,\!\?\){}\-�;< ])/\n$$1/g'
#Similar to TR_TOKENIZER, except retains the punctuation tokens. Re-worked
# to use sed rather than ped, but not tested. Square brackets have been
# removed earlier (they contain phonemes omitted in fast speech).
#Fix: non-Unicode chars! omit in favor of XFST_TOKENIZER?
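
#A possible sanity check for the sed-based PED_TOKENIZER (marked above as untested):
# run it over a small cp1252-encoded sample via a throwaway target along these lines.
# The 'sample.txt' file and the 'check-tokenizer' name are hypothetical, and the
# target is deliberately left commented out (it is not part of the build):
#
#check-tokenizer: sample.txt
#	$(CAT) $< | $(PED_TOKENIZER) | head -40
#
# Each word and each retained punctuation mark should come out on its own line,
# with no cp1252 bytes left in the output.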
NAH_TOKENIZER = $(XFST_TOKENIZER)

LOWER_CASE = $(TR) "A-Z" "a-z"

#Override the defn in the next dir up, to include conversion to UTF-8
# (otherwise 'ped' messes up):
NORMALIZE_SFMS = \
	REC_SEP=$(IN_REC_SEP) ; \
	$(CAT) $(LEXICON_FILES) \
	| $(TR) -s "\t\r " " " \
	| $(ICONV) -f ISO-8859-1 -t UTF-8 \
	| $(ONE_REC_PER_LINE) \
	| $(ONE_FLD_PER_LINE)

POS_SFM = psm

.PHONY: all Stems WordAnalyses Debug
.PRECIOUS: TestSuite/%.xml

#The SFMs whose content we need to extract from the Shoebox lexicon
# (not including the leading backslash):
IN_REC_SEP = ref
LEMMA_SFM = lxoa
#Oapan primary lemma
UR_SFM = lxoa_pr
#Oapan UR (if present, used instead of primary lemma as underlying form)
POS_SFM = psm
#POS
VERB_PARADIGM_SFM = infv
#Paradigm class for verbs
NOUN_SUBCLASS_SFM = infn
#Subclasses for nouns (telling whether they are optionally or obligatorily possessed, etc.)
ALLOMORPH_SFM = allomorph
#Contains code for allomorph properties. Never more than one code, although at least one
# is a two-word code: "object deletion". A few have an explicit allomorph, but this may
# be removed later.
NA_LEMMA = "----"
#Code Jonathan uses in fields which are NA (?)

#------------Dependencies--------
#Parser and generator differ in dependencies only in their respective subdirectories.

#PhonologyClasses dependence is automatic, since it depends only on its
# corresponding .xfst file

#PhonologyRules:
GeneratorFSMs/PhonologyRules.fsm: GeneratorFSMs/PhonologyRules.xfst GeneratorFSMs/PhonologyClasses.xfst
ParserFSMs/PhonologyRules.fsm: ParserFSMs/PhonologyRules.xfst ParserFSMs/PhonologyClasses.xfst

#Reduplication:
GeneratorFSMs/Reduplication.fsm: GeneratorFSMs/Reduplication.xfst GeneratorFSMs/PhonologyClasses.fsm
ParserFSMs/Reduplication.fsm: ParserFSMs/Reduplication.xfst ParserFSMs/PhonologyClasses.fsm

#Adjectives:
GeneratorFSMs/AdjInflection.fsm: GeneratorFSMs/AdjInflection.xfst GeneratorFSMs/PhonologyClasses.fsm
ParserFSMs/AdjInflection.fsm: ParserFSMs/AdjInflection.xfst ParserFSMs/PhonologyClasses.fsm

#Nouns:
GeneratorFSMs/NounStems.fsm: GeneratorFSMs/NounStems.xfst GeneratorFSMs/PhonologyClasses.fsm GeneratorFSMs/LoadRoots.xfst
ParserFSMs/NounStems.fsm: ParserFSMs/NounStems.xfst ParserFSMs/PhonologyClasses.fsm ParserFSMs/LoadRoots.xfst
# NounStems.xfst doesn't actually load the file 'LoadRoots.xfst', but it does load
# the various noun lexc files that are created when LoadRoots.xfst is created.
GeneratorFSMs/NounInflection.fsm: GeneratorFSMs/NounInflection.xfst GeneratorFSMs/NounStems.fsm GeneratorFSMs/PhonologyClasses.fsm
ParserFSMs/NounInflection.fsm: ParserFSMs/NounInflection.xfst ParserFSMs/NounStems.fsm ParserFSMs/PhonologyClasses.fsm

#Verbs:
GeneratorFSMs/VerbStems.fsm: GeneratorFSMs/VerbStems.xfst GeneratorFSMs/LoadRoots.xfst GeneratorFSMs/PhonologyClasses.fsm \
	GeneratorFSMs/Reduplication.fsm
ParserFSMs/VerbStems.fsm: ParserFSMs/VerbStems.xfst ParserFSMs/LoadRoots.xfst ParserFSMs/PhonologyClasses.fsm \
	ParserFSMs/Reduplication.fsm
# VerbStems.xfst doesn't actually load the file 'LoadRoots.xfst', but it does load
# the various verbal lexc files that are created when LoadRoots.xfst is created.
GeneratorFSMs/VerbPrefixes.fsm: GeneratorFSMs/VerbPrefixes.xfst
ParserFSMs/VerbPrefixes.fsm: ParserFSMs/VerbPrefixes.xfst
GeneratorFSMs/VerbSuffixes.fsm: GeneratorFSMs/VerbSuffixes.xfst
ParserFSMs/VerbSuffixes.fsm: ParserFSMs/VerbSuffixes.xfst
GeneratorFSMs/VerbInflection.fsm: GeneratorFSMs/VerbInflection.xfst GeneratorFSMs/VerbStems.fsm \
	GeneratorFSMs/VerbPrefixes.fsm GeneratorFSMs/VerbSuffixes.fsm \
	GeneratorFSMs/PhonologyClasses.fsm
ParserFSMs/VerbInflection.fsm: ParserFSMs/VerbInflection.xfst ParserFSMs/VerbStems.fsm \
	ParserFSMs/VerbPrefixes.fsm ParserFSMs/VerbSuffixes.fsm \
	ParserFSMs/PhonologyClasses.fsm

#SpellingRelaxationRules exists only as a parser:
ParserFSMs/SpellingRelaxationRules.fsm: ParserFSMs/SurfaceForms.fsm ParserFSMs/SpellingRelaxationRules.xfst

#SurfaceForms (complete parser):
GeneratorFSMs/SurfaceForms.xfst: SurfaceForms.xfst
#GeneratorFSMs/SurfaceForms.fsm is not actually built, but we list it here anyway...
GeneratorFSMs/SurfaceForms.fsm: GeneratorFSMs/SurfaceForms.xfst GeneratorFSMs/VerbInflection.fsm GeneratorFSMs/NounInflection.fsm \
	GeneratorFSMs/AdjInflection.fsm GeneratorFSMs/PhonologyClasses.fsm GeneratorFSMs/PhonologyRules.fsm
ParserFSMs/SurfaceForms.xfst: SurfaceForms.xfst
ParserFSMs/SurfaceForms.fsm: ParserFSMs/SurfaceForms.xfst ParserFSMs/VerbInflection.fsm ParserFSMs/NounInflection.fsm \
	ParserFSMs/AdjInflection.fsm ParserFSMs/PhonologyClasses.fsm ParserFSMs/PhonologyRules.fsm

#------------------------------------------------
#-------------------Recipes----------------------
#------------------------------------------------

#---------Building parser and generator----------
all: Stems.out WordAnalyses.out Debug.out

$(UTF8_LEX): $(LEXICON_FILES)
#Convert to UTF-8, so Toolbox can read it without silently deleting ISO chars.
	$(ICONV) -f ISO-8859-1 -t UTF-8 < $^ > $@

$(XML_LEX): Dictionary.utf8
#This cannot be produced by makefile, only by the Toolbox GUI.
# But if the xml version is not newer than the UTF-8 version,
# we can at least signal an error:
	$(ECHO) "Cannot proceed; the XML version needs to be produced from the UTF8 version by Toolbox."
	exit 1

$(LOCAL_LEX): $(LEXICON_FILES)
#LOCAL_LEX is defined in the Languages makefile as NormalizeLex.db;
# the local defn for NORMALIZE_SFMS converts the dictionary entries
# to UTF-8.
	$(BANNER)
	$(NORMALIZE_SFMS) \
	> $@

GeneratorFSMs/%.xfst: %.xfst
#Create a version of the source code which only allows obligatory phonological rules,
# plus a subset of the optional ones. Also don't include certain rare suffixes (like -tia).
# At present, we don't actually use the gpp macro name 'GENERATOR', but we have
# it here just in case...
	$(BANNER)
# First create a UTF-8 version of the file in question:
	$(CAT) $< \
	| $(ICONV) -f ISO-8859-1 -t UTF-8 \
	| $(GPP) -D GENERATOR -D $(DIALECT) \
	> GeneratorFSMs/$<

GeneratorFSMs/%.fsm: GeneratorFSMs/%.xfst
# Then compile it:
	cd GeneratorFSMs ; $(XFST) -e 'source $(notdir $<)' -e 'save defined $(notdir $@)' -stop

ParserFSMs/%.xfst: %.xfst
#Create a version of the source code that allows both obligatory and optional phonological rules,
# and includes all suffixes (even the rare ones, like -tia).
	$(BANNER)
# First create a UTF-8 version of the file in question:
	$(CAT) $< \
	| $(ICONV) -f ISO-8859-1 -t UTF-8 \
	| $(GPP) -D PARSER -D $(DIALECT) \
	> ParserFSMs/$<

ParserFSMs/%.fsm: ParserFSMs/%.xfst
# Then compile it:
	cd ParserFSMs ; $(XFST) -e 'source $(notdir $<)' -e 'save defined $(notdir $@)' -stop

$(GENERATOR): GeneratorFSMs/SurfaceForms.xfst GeneratorFSMs/PhonologyClasses.fsm \
	GeneratorFSMs/PhonologyRules.fsm GeneratorFSMs/VerbInflection.fsm \
	GeneratorFSMs/NounInflection.fsm GeneratorFSMs/AdjInflection.fsm
#Build the generator, which allows only the strict version of some phonological rules.
	$(BANNER)
	$(CAT) GeneratorFSMs/SurfaceForms.xfst \
	| $(ICONV) -f ISO-8859-1 -t UTF-8 \
	| $(GPP) -D GENERATOR -D $(DIALECT) \
	> GeneratorFSMs/Generator.xfst
	cd GeneratorFSMs ; $(XFST) -e 'source Generator.xfst' -e 'save defined ../$@' -stop

$(PARSER): ParserFSMs/SurfaceForms.xfst ParserFSMs/PhonologyClasses.fsm \
	ParserFSMs/PhonologyRules.fsm ParserFSMs/VerbInflection.fsm \
	ParserFSMs/NounInflection.fsm ParserFSMs/AdjInflection.fsm
#Build the parser, which allows variant phonological rules.
	$(BANNER)
	$(CAT) ParserFSMs/SurfaceForms.xfst \
	| $(ICONV) -f ISO-8859-1 -t UTF-8 \
	| $(GPP) -D PARSER -D $(DIALECT) \
	> ParserFSMs/Parser.xfst
	cd ParserFSMs ; $(XFST) -e 'source Parser.xfst' -e 'save defined ../$@' -stop

TextsToParse/%.utf8: TextsToParse/%.txt
#Convert a paragraph-formatted text into a tokenized text, including conversion
# to UTF-8. Steps:
# 1) Convert any tab chars or CRs to a (single) space char, and any sequence
#    of space chars to a single space char.
# 2) Temporarily convert existing newlines (which mark paragraphs) to tabs.
# 3) Get rid of any footnote markers. These are either a digit (and this seems
#    to be the only use of digits), or a code of the form [JDA<n>], where <n>
#    is a sequence of digits (only one in the data, but we handle multiple digits).
# 4) Get rid of any remaining square brackets, which surround phonemes which
#    were omitted in fast speech.
# 5) Tokenize.
# 6) Convert the tab chars from step (2) to a sequence of two newlines.
# 7) Get rid of the space chars left over from tokenization (all the other
#    token separators, e.g. punctuation, need to be retained).
# 8) Convert upper case to lower case.
	@$(BANNER)
	@ $(CAT) $< \
	| $(TR) -s '\t\r ' ' ' \
	| $(TR) '\n' '\t' \
	| $(TR) -d '0123456789' \
	| $(SED) -e 's/\[JDA\]//g' \
	| $(TR) -d '[]' \
	| $(PED_TOKENIZER) \
	| $(SED) -e 's/\t/\n\n/g' \
	| $(TR) -d ' ' \
	| $(LOWER_CASE) \
	> $@

TextsToParse/%.parse: TextsToParse/%.utf8 SR2Gl.lkp
#Parse a pre-tokenized text. (Apparently 'lookup' introduces CRs, which we
# dutifully remove...)
	@$(BANNER)
	$(CAT) $< \
	| $(LOOKUP) -utf8 -flags TT SR2Gl.lkp \
	| $(TR) -d '\r' \
	> $@

TextsToParse/UnparsedTypes.utf8: TextsToParse/*.parse
#Collect a list of all the types (not tokens) which fail to parse, omitting
# punctuation and things between <spn>...</spn> tags.
# Steps:
# (1) Select all the lines that don't parse (these are marked by '+?').
# (2) Get rid of all but the first column, which contains the input word.
# (3) Put everything on one line...
# (4) ...so we can remove any sequences consisting of things inside <spn>...</spn>
#     tags (including removing the tags).
# (5) Break the tokens back up into one per line.
# (6) Sort the tokens uniquely, giving types.
# (7) Get rid of any punctuation tokens (but 'grep' doesn't recognize non-ASCII
#     punctuation).
	@$(BANNER)
	$(FGREP) "+?" $^ \
	| $(CUT) -f 1 \
	| $(TR) -s " \n" " " \
	| $(SED) -e "s%<spn>[^<]*</spn>%%g" \
	| $(TR) " " "\n" \
	| $(GREP) -v "^[[:punct:]]$$" \
	| $(SORT) -u \
	> $@

#--------------Testing-------------
#The following are used to create two levels of a three-level interlinear format;
# they are intended for use with the 'lookup' program, and are mnemonically labeled
# with the input on the left and the output on the right (and a '2' in between
# as a reminder that these 'lookup' files only work in one direction).
# These are used to generate paradigm forms for regression testing.
SR2Gl.lkp: $(PARSER)
#Parse the surface form into its glosses.
	@$(BANNER)
	$(XFST) -e 'load defined $<' -e 'read regex GlSR;' -e 'save stack $@' -stop

SR2UR.lkp: $(PARSER)
#Parse the surface form into an underlying form.
	@$(BANNER)
	$(XFST) -e 'load defined $<' -e 'read regex URSR;' -e 'save stack $@' -stop

Gl2UR.lkp: $(PARSER)
#Generate the underlying form from the gloss form.
	@$(BANNER)
	$(XFST) -e 'load defined $<' -e 'read regex GlUR;' -e 'invert net' -e 'save stack $@' -stop

GlBr2UR.lkp: $(PARSER)
#Generate the underlying form from the gloss form (but where the gloss forms
# have square brackets around the stems):
	@$(BANNER)
	$(XFST) -e 'load defined $<' -e 'read regex GlBrUR;' -e 'invert net' -e 'save stack $@' -stop

UR2SR.lkp: $(PARSER)
#Generate the surface form from the underlying form.
	@$(BANNER)
	$(XFST) -e 'load defined $<' -e 'read regex URSR;' -e 'invert net' -e 'save stack $@' -stop

UR2Gl.lkp: $(PARSER)
#Relate a seq of underlying forms to their glosses.
	@$(BANNER)
	$(XFST) -e 'load defined $<' -e 'read regex GlUR;' -e 'save stack $@' -stop

ParadigmClasses: $(LOCAL_LEX)
#Construct a list of unique paradigm classes, for verbs and nouns (sorted separately).
# Lists the (unique) contents of the verbal paradigm class field (\infv) and the noun
# paradigm class field (\infn).
	$(BANNER)
# First verbs:
	$(ECHO) "------Verb Paradigm classes------" \
	> $@
	FIELDS=$(VERB_PARADIGM_SFM) ; \
	$(EXTRACT_FIELDS) \
	| $(SORT) -u \
	>> $@
# ...a blank line:
	$(ECHO) \
	>> $@
# Then nouns:
	$(ECHO) "------Noun Paradigm classes------" \
	>> $@
	FIELDS=$(NOUN_SUBCLASS_SFM) ; \
	$(EXTRACT_FIELDS) \
	| $(SORT) -u \
	>> $@

ParadigmCounts: ParadigmClasses
#Construct a list of unique paradigm classes, each followed by a (tab separated) count.
# The list is sorted from most common to least common within verbs, and then
# (separately) within nouns.
##FIX: not working; I get 0 counts on many classes, which is impossible.
# The problem is probably in ../CountStrings.py, which is treating the strings as
# regexs, so that special chars like "<" get funny treatment.
	$(BANNER)
# First verbs:
	$(ECHO) "------Verb Paradigm counts------" \
	> $@
	RXPREFIX="\\$(VERB_PARADIGM_SFM)\ " ; \
	RXSUFFIX="$$" ; \
	$(COUNTS)
# ...a blank line:
	$(ECHO) \
	>> $@
# Then nouns:
	$(ECHO) "------Noun Paradigm counts------" \
	>> $@
	RXPREFIX="\\$(NOUN_SUBCLASS_SFM)\ " ; \
	RXSUFFIX="$$" ; \
	$(COUNTS)

LoadRoots.xfst: $(XML_LEX) NahuatlCodes.py
#Extract the lex entries for the roots from the XML version of the lexicon.
# One lexc file is created in a subdir for each inflected POS, plus one
# lexc file for all uninflected POSs together, plus the target file containing
# the xfst code to load those lexc files. We also produce a summary count
# of all lexical errors.
# The data is extracted from an XML/UTF-8 version of the dictionary.
	@$(BANNER)
# Remove all the existing files in the lexc directory, rather than just
# overwriting them, because if a category is removed from the list of inflected
# categories, we don't want to leave its corresponding lexc file lying around:
	-rm lexc/*
# Now create the new lexc files, and the target file. Toolbox used to replace
# the '<' and '>' of Jonathan's embedded XML tags with &lt; and &gt;, and this
# is necessary in order for Python to interpret them for our purposes. Toolbox
# seems no longer to do this, so we use 'sed' to do it. If Toolbox reverts to
# replacing them, the 'sed' op will be a no-op (and therefore safe).
	cd lexc ; \
	$(CAT) ../$(XML_LEX) \
	| $(SED) -e "s/<klamoa>/\&lt;klamoa\&gt;/" \
	-e "s%</klamoa>%\&lt;/klamoa\&gt;%" \
	| $(PYTHON) -O ../NahTab2MultLexc.py -c ../NahuatlCodes.py -e LoadRootsErrors.txt \
	> ../$@
# Output a summary of the lexicon warnings, sorted according to general type (done
# by deleting the specifics of each msg, namely everything after a colon or single quote),
# plus a count for each type:
	@$(MAGENTA); $(ECHO) "----------Summary of lexicon warnings follows-----------" > /dev/tty ; $(BLACK)
	@$(GREEN)
	@ $(CAT) lexc/LoadRootsErrors.txt \
	| $(GREP) Warning \
	| $(SED) -e"s/^ *Warning\: //" \
	-e"s/[\:\x27].*$$//" \
	| $(SORT) \
	| $(UNIQ) -c \
	| $(TEE) lexc/LoadRootsSummary.txt # Send summary to both screen and file; latter is in RCS
	@$(MAGENTA); $(ECHO) "--------------------------------------------------------" > /dev/tty ; $(BLACK)

Stems.out: VerbStems.fsm
#Output the Nahuatl verb stems sorted by class. The class is assumed to be the
# second item (-k 2) on the line, using '^' to separate items; the first item
# is an identifier of the stem type, like "SHORT:", some spaces, and the lexeme.
# This allows us to sort short and long stems of the same lexeme (actually, class) together.
# Finally, we use 'sed' to remove the class names, and then use sed's 'G' command to
# stick in a newline after each SHORT stem (the short stems for some reason wind up last).
# Can't use the -f flag to read in the file--have to use "-e source <fname>".
	@$(BANNER)
	@$(XFST) -e "source VerbStems.xfst" -e "read regex LabeledVStem;" -e "print lower-words;" -stop \
	| $(SORT) -k 2 -t'^' \
	| $(SED) -e "s/\^C.*$$//" \
	| $(SED) '/SHORT/{G;};' \
	> $@

Debug.out: Debug.xfst $(GENERATOR)
#Run various diagnostics, including upper and lower alphabets, check for unmatched flag diacritics.
	@$(BANNER)
	$(XFST) -e "source $<" -stop \
	> $@
# Also check for any unmatched flag diacritics, which would appear inside a "<...>"
# in the following (the leading "-" is because if there are no unmatched flags,
# grep will signal an error). We put the output in a temp file...
	- $(XFST) -e "load defined VerbInflection.fsm" \
	-e "read regex UnderlyingFormUR;" \
	-e "label net" \
	-e "set show-flags ON" \
	-e "print words" \
	-stop \
	| $(GREP) "@" \
	| $(GREP) "<" \
	> $(TMP)/$@
# ...then check whether we got anything. If the above file is of size > 0,
# then the two greps must have found s.t., namely a one-sided flag diacritic;
# add this to the target file:
	@if test -s $(TMP)/$@ ; then \
	  $(ECHO) >> $@ ; \
	  $(ECHO) "Warning: One-sided flag diacritics:" >> $@ ; \
	  cat $(TMP)/$@ >> $@ ; \
	  $(ERROR) "Warning: One-sided flag diacritics, see $@ for details." ; \
	else \
	  $(ECHO) "One-sided flag diacritic test passed successfully." >> $@ ; \
	fi

LowerAlphabet.txt: $(GENERATOR)
#Create a list of the symbols in the lower side alphabet, not counting
# the diacritic flags. NB: There is a space char that shows up inside
# double quotes. The space gets turned into a newline, leaving the two
# double quotes, which we grep out to avoid confusion.
	$(XFST) -e "loadd $<" \
	-e "regex GlSR .l ;" \
	-e "print labels" \
	-stop \
	| $(GREP) -v "Size" \
	| $(TR) " " "\n" \
	| $(GREP) -v "@" \
	| $(GREP) -v '"' \
	> $@
	-$(RCSDIFF) /cygdrive/c/RCS/C/Data/Data/Languages/Nahuatl/$@,v $@

UpperAlphabet.txt: $(GENERATOR)
#Create a list of the symbols in the upper side alphabet, not counting
# the diacritic flags. NB: There is a space char that shows up inside
# double quotes. The space gets turned into a newline, leaving the two
# double quotes, which we grep out to avoid confusion.
	$(XFST) -e "loadd $<" \
	-e "regex GlSR .u ;" \
	-e "print labels" \
	-stop \
	| $(GREP) -v "Size" \
	| $(TR) " " "\n" \
	| $(GREP) -v "@" \
	| $(GREP) -v '"' \
	> $@
	-$(RCSDIFF) /cygdrive/c/RCS/C/Data/Data/Languages/Nahuatl/$@,v $@

GlossTable.txt: UpperAlphabet.txt ExpandGlosses.lkp
#Create a table of all the affixal glosses, and add a second column with
# the gloss abbreviations expanded. Used for the DOE tool.
# Unclear why 'lookup' puts tabs between leading dashes and rest of output,
# so we use 'sed' to remove these (and 'tr' to remove extraneous space chars,
# and 'grep' to remove the blank lines 'lookup' inserts between records).
	$(CAT) $< \
	| $(TR) " " "\n" \
	| $(GREP) "[\=\+\-]" \
	| $(LOOKUP) ExpandGlosses.lkp \
	| $(SED) -e"s/- /-/" \
	| $(TR) -d " " \
	| $(GREP) -v "^$$" \
	1> $@ 2> /dev/null

#-------------------Test Suite----------------
TestSuite/VerbTable.out: TestSuite/VerbTable.in $(GENERATOR)
#Probably a temporary recipe, to generate the verb table for Jonathan's demo.
# Pass it through dos2unix to get rid of the CRs that xfst introduces.
	$(XFST) -f $< \
	| $(DOS2UNIX) \
	> $@

TestSuite/%.out: TestSuite/%.in $(PARSER)
#This set of test forms is based on the Word doc '2 Verb tenses.doc'.
# It may be superseded at some point by the XML paradigm generators below.
# Passing it through DOS2UNIX seems to be necessary, else xfst inserts CRLFs.
	$(BANNER)
	$(XFST) -q \
	-e "loadd $(PARSER)" \
	-e "regex GlSR; " \
	-e "source $<" \
	-stop \
	| $(DOS2UNIX) \
	> $@

TestSuite/%.in: TestSuite/%.Defn.xml TestSuite/%.Stems.list TestSuite/EntityDefns.xml
# Validate the XML file listing the paradigm definitions, then
# use it to generate the xfst input file (.in) from the list of stems+POSs.
	$(BANNER)
# Use xsltproc to pre-process the paradigm defn file to include some entities:
	-rm $(TMP)/$(notdir $<)
	cd TestSuite \
	; $(XSLTPROC) $(NULL_STYLESHEET) $(notdir $<) > $(TMP)/$(notdir $<)
# Make sure the result is a valid paradigm defn file:
	@$(ECHO) Validating paradigms...
	$(XMLLINT) --noout --noent --schema $(GEN_PARADIGMS_SCHEMA) $(TMP)/$(notdir $<)
# If we get here, TestSuite/%.Defn.xml passed the validation;
# use it (or rather, the copy in /tmp) to generate the xfst input file:
	$(GEN_PARADIGMS) -e 'UTF-8' -f $(basename $@).Stems.list -p $(TMP)/$(notdir $<) -r " .o. Lex" -o $@

TestSuite/%.xml: TestSuite/%.in $(PARSER)
#Tell the parser (not the generator) to generate the desired forms for
# the specified test suite input file. The forms are output to an XML file,
# which contains three columns: Surface form--Underlying form--Gloss.
##FIX: Another kludge: the .GlURSR file lacks the square brackets around the root,
# so we use sed to remove the square brackets on the roots here (the square
# brackets are originally put there by GenParadigms.py; maybe we should just
# eliminate them there??)
	$(BANNER)
# First create an XML file with just the gloss forms, as generated
# from the "features". The .in file that we "source" uses the upper side
# of the 'Lex' variable. Since there is some ambiguity going from the upper
# (gloss) side to the lower (UR) side, because of identical glosses for the same
# affixal morpheme ICO dialectal variation etc., we would get multiple but
# identical gloss strings. So we take the upper side of the Lex var, thereby
# avoiding this unwanted ambiguity.
	$(XFST) -e "loadd $(PARSER)" \
	-e "regex Lex .u;" \
	-e "define Lex" \
	-e "source $<" \
	-stop \
	| $(SED) -e "s/\[//" -e "s/\]//" \
	> $(TMP)/$(notdir $(basename $@)).xml
# Now take the words that were generated, unique them...
	$(CAT) $(TMP)/$(notdir $(basename $@)).xml \
	| $(FGREP) -v "<" \
	| $(SORT) -u \
	> $(TMP)/$(notdir $(basename $@)).words
# ...and get all three forms (SR, UR, Gl):
	$(MAKE) $(TMP)/$(notdir $(basename $@)).GlURSR
# Now merge these parses back into the XML file (the '-' arg on xsltproc means "read from stdin").
# Kludge: Unclear why, but we get many duplicate lines if we don't run it through 'uniq'. These
# don't seem to correspond to a real ambiguity in the parser/generator.
	$(PYTHON) SR2Parses.py -u -x $(TMP)/$(notdir $(basename $@)).xml -p $(TMP)/$(notdir $(basename $@)).GlURSR \
	| $(XSLTPROC) SortParadigms.xslt - \
	| $(UNIQ) \
	> $@

TestSuite/%.html: TestSuite/%.xml
#Convert the XML version of a test suite output into an HTML file.
	$(BANNER)
	$(XSLTPROC) ParadigmXML2HTML.xsl $< \
	> $@

TestAll: $(GENERATOR) $(PARSER)
#Test a representative set of data.
# The dependencies on $(GENERATOR) and $(PARSER) are strictly superfluous, because
# the individual files in $(TESTSUITE_OUT) have one or the other as dependencies.
# However, it's cleaner to put them as dependencies for this target, because this
# prevents 'make' from repeatedly trying to make them (once for each file in
# $(TESTSUITE_OUT)) if they cannot be made.
# NB: in order to create an html diff file of the .html output to send to Jonathan,
# do a ComponentSoftware RCS diff of the html file, then do 'File | Save as HTML'.
# Then edit the resulting html-diff file by changing all the '&lt;' to '<',
# and '&gt;' to '>'. Can't automate(?), because there is no way to create the html-diff
# with the fancy yellow color coding and the pop-up navigation window in batch mode.
	for fname in $(TESTSUITE_OUT) ; do \
	  $(MAKE) $$fname ; \
	done

NahTokenize.fsm: NahTokenize.xfst
#Don't use the Tokenize.fsm in the generic makefile.
	$(BANNER)
# Convert to UTF-8:
	$(ICONV) -f ISO-8859-1 -t UTF-8 < $< > utf8/$(notdir $<)
# ...then run xfst (the $(XFST) var includes the value of XFST_FLAGS, which tells
# xfst to assume UTF-8 encoding):
	$(XFST) -e 'source utf8/$(notdir $<)' -stop

%.parse: %.txt SR2Gl.lkp NahTokenize.fsm
#Run the parser over an input text.
#WARNING: This uses ISO-8859-1. If you need UTF-8, then first convert the
# .txt file to .utf8, so it can use the %.utf8-->%.parse recipe above.
# The Xerox tokenizer is used instead of the tr tokenizer, because it allows
# us to tokenize the clitic 'ma' together with the following word (which had
# better be a verb). The output format of the target file is a series of records,
# each representing a non-punctuation token of the input, with records separated
# by a blank line; within a record, each line represents a parse (but if there
# were no parses, the single line of the record contains a '+?').
	$(BANNER)
# Tokenize into two temp files, which we can look at for debugging,
# word counts etc. The first is a normalized version of the original
# text, one lower-cased word per line, omitting punctuation:
	$(CAT) $< \
	| $(NAH_TOKENIZER) \
	| $(LOWER_CASE) \
	> $(TMP)/$(notdir $(basename $@)).norm
# ...and the second is a list of unique words in that normalized text:
	$(CAT) $(TMP)/$(notdir $(basename $@)).norm \
	| $(SORT) -u \
	> $(TMP)/$(notdir $(basename $@)).words
# Then parse both files. The first one (original but normalized text,
# then parsed) becomes the target file of this recipe:
	$(CAT) $(TMP)/$(notdir $(basename $@)).norm \
	| $(LOOKUP) -utf8 -flags TT SR2Gl.lkp \
	> $@
# ...and the second, containing the unique words and their parses
# and failures, goes to another file:
	$(CAT) $(TMP)/$(notdir $(basename $@)).words \
	| $(LOOKUP) -utf8 -flags TT SR2Gl.lkp \
	| $(GREP) -v "^[[:space:]]$$" \
	> $(basename $@).words.parse
# Collect some statistics:
	@$(ECHO) Word counts:
	@$(ECHO) Unique words in text = `$(CAT) $(TMP)/$(notdir $(basename $@)).words | $(WC) -l`
	@$(ECHO) Unique words parsed = `$(CAT) $(basename $@).words.parse \
	| $(FGREP) -v "+?" \
	| $(CUT) -f 1 \
	| $(UNIQ) \
	| $(WC) -l`

%.parse.xml: %.txt $(PARSER) SR2UR.lkp UR2Gl.lkp NahTokenize.fsm
#Run the parser over an input text, and produce an XML interlinear version.
# The output format of the target file is an XML file, containing a sequence of records:
#	<parserOutput>
#	  <parses form="foos">
#	    <parse morphemes="foo+s" glosses="house+PL"/>
#	    <parse morphemes="f+oo+s" glosses="1SgS+run+PAST"/>
#	  </parses>
#	  <parses form=",">
#	    <parse morphemes="," glosses="," />
#	  </parses>
#	  <parses form="baring">
#	  </parses>
#	</parserOutput>
	$(BANNER)
# Tokenize into two temp files, which we can look at for debugging,
# word counts etc. The first is a tokenized and lower-cased version
# of the original text, one lower-cased token per line:
##FIX: Need to deal with xml tags in input, including <spanish>foo</spanish>,
# and use of ellipsis.
	$(CAT) $< \
	| $(NAH_TOKENIZER) \
	| $(LOWER_CASE) \
	| $(TR) -d "\r" \
	> $(TMP)/$(notdir $(basename $@)).norm
# ...and the second is a list of unique words in that normalized text:
	$(CAT) $(TMP)/$(notdir $(basename $@)).norm \
	| $(SORT) -u \
	> $(TMP)/$(notdir $(basename $@)).words
# ...and convert this into a single file, with tab-delimited
# columns for Gl-UR-SR:
	$(MAKE) $(TMP)/$(notdir $(basename $@)).GlURSR
# Now we're ready to read in the normalized text file, and output an XML file:
	$(PYTHON) Text2Parses.py -p $(TMP)/$(notdir $(basename $@)).GlURSR \
	> $@

%.GlURSR: %.words $(PARSER) Gl2UR.lkp UR2SR.lkp
#Convert a .words file containing a list of Nahuatl parses in their gloss form, one per line,
# into a .GlURSR file.
# The latter (output) file contains three tab delimited columns, with the Gloss parse
# (Gl) in the first column, the Underlying Representation (UR) in the second column, and
# the original Surface Representation (SR) in the third column.
# (Used by both the TestSuite files and for parsing a file of Nahuatl text.)
	$(BANNER)
# First use the gloss strings in the .words file to generate their underlying representation.
# We discard the error stream output of 'lookup', because this is just a string
# saying that it has loaded the .lkp file; we also discard any words that didn't generate
# (that shouldn't happen, but...).
	$(CAT) $< \
	| $(LOOKUP) -utf8 -flags TT Gl2UR.lkp \
	2> /dev/null \
	| $(FGREP) -v "+?" \
	| $(TR) -d "\r" \
	| $(GREP) -v "^[[:space:]]*$$" \
	> $(TMP)/$(notdir $(basename $@)).Gl2UR
# For the words that correctly generated, we'll also generate their surface form
# from their underlying form.
# We're generating from their UR, because the stage from Gl to UR is ambiguous (when?)
# But generating from UR is also ambiguous for C-final nouns, because the -inal.possd.sg
# and the -al.possd.sg are homophonous (they only differ on V-final nouns). So we use
# a kludge, and 'uniq' the result.
	$(CAT) $(TMP)/$(notdir $(basename $@)).Gl2UR \
	| $(CUT) -f 2 \
	| $(LOOKUP) -utf8 -flags TT UR2SR.lkp \
	| $(GREP) -v "^[[:space:]]*$$" \
	| $(UNIQ) \
	> $(TMP)/$(notdir $(basename $@)).UR2SR
# Now convert these two files into a single file, with tab-delimited
# columns for Gl-UR-SR. The merging is done by the UR, since one
# gloss form may generate > one underlying representation, and we want
# to get the right one:
	$(PYTHON) MergeByUR.py -l $(TMP)/$(notdir $(basename $@)).Gl2UR -r $(TMP)/$(notdir $(basename $@)).UR2SR \
	> $@

#-------------Backup, Delivery------------------
backup.tar.gz: $(ZIP_FILES)
#Backup the source code files to a tar.gz file for xfer to a backup location.
	$(TAR) cfz $@ $^
	$(GREEN) ; $(ECHO) "Backing up to LDC..." ; $(BLACK)
	$(ECHO) "put $@ /web/edu/upenn/ldc/nahuatl/xfst_dir/" \
	| $(FTP) maxwell@login.ldc.upenn.edu
	$(GREEN); $(ECHO) "Backing up to Balsas..." ; $(BLACK)
	$(ECHO) "put $@ /home/cluster1/data/o/d/1121861/html/maxwell" \
	| $(FTP) mmaxwell@www.balsas-nahuatl.org

ldc: $(NETWORK_FILES)
#Upload the parser, generator, and .lkp files to the LDC server. This is complicated
# by the fact that all the 'sftp' cmds that operate on the file system (like rm, ls,
# put) take only a single arg. So things have to be done with separate cmds, using
# loops. We therefore build a file of commands (easier than echoing all the stuff
# in the command line):
	$(ECHO) "cd /web/edu/upenn/ldc/nahuatl/xfst_dir/" > $(TMP)/ftpcmds.txt
# Then create a 'rm' cmd and a 'put' cmd for each file (if we don't first rm them,
# 'put' won't overwrite them):
	for fn in $(NETWORK_FILES) ; do \
	  $(ECHO) "rm $$fn" >> $(TMP)/ftpcmds.txt ; \
	  $(ECHO) "put $$fn" >> $(TMP)/ftpcmds.txt ; \
	done
# Add two commands to list the results, to make sure it worked
# (need separate cmds for .fsm and .lkp, because the ftp version of 'ls'
# only takes a single arg):
	$(ECHO) "ls -l *.fsm" >> $(TMP)/ftpcmds.txt
	$(ECHO) "ls -l *.lkp" >> $(TMP)/ftpcmds.txt
# ...and do it (can't use the -b arg on sftp, because then you have to use
# non-interactive authorization, rather than a password):
	$(CAT) $(TMP)/ftpcmds.txt | $(FTP) maxwell@login.ldc.upenn.edu

balsas: $(PARSER) $(GENERATOR)
#Upload the parser and generator to Balsas (= the http site). When the xfer
# is done, we do an 'ls' to verify that the files got there.
# The '\n' in the commands below are newlines, not part of directory names!
	ls -l *.fsm
	$(ECHO) -e "cd html/programmers\nput $(PARSER)\nput $(GENERATOR)\nls -l *.fsm" \
	| $(FTP) mmaxwell@ftp.balsas-nahuatl.org

DOE.tar.gz: $(PARSER)
#Create the deliverable for the DOE grant as a tar.gz file, and send it to the LDC.
# Includes:
#	The compiled parser
#	The parser (but not generator) versions of the Oapan xfst files (like
#	  the source .xfst files, but gpp has already been run to remove generator
#	  versions of rules and any non-Oapan stuff)
#	The UTF-8 XML (not SFM) lexicon
#	The Python files to convert the UTF-8 lexicon into the *.lexc files
#	The *.lexc files
# First clear out anything in the DOE dir:
	-rm -r DOE
	mkdir DOE
# ...then start populating it:
# Unicode XML lexicon: As of 27 June 2009, we do not send this, because Jonathan's
# XML dictionary is adequate (and correctly converts the '<' and '>' of embedded
# XML tags to '&lt;' and '&gt;', which for some reason Toolbox does not).
##	cp Dictionary.xml DOE/
# Oapan parser source code:
	mkdir DOE/XfstFiles
	cp ParserFSMs/*.xfst DOE/XfstFiles
# Python converter files:
	cp NahTab2MultLexc.py NahuatlCodes.py DOE/
# Lexc files are created from the XML lexicon, so we only create their directory:
	mkdir DOE/lexc
# Shell scripts (and Dave Graff's Perl script) to create the executable and parse:
	cp CompileParser.sh ParseText.sh ParseText.pl DOE
# xfst and lookup networks:
	xfst -utf8 -q -e 'load defined Parser.fsm' -e 'read regex URSR;' \
	-e 'save stack UR2SR.lkp' -stop
	xfst -utf8 -q -e 'load defined Parser.fsm' -e 'read regex GlBrUR;' \
	-e 'save stack GlBr2UR.lkp' -stop
	cp $(PARSER) UR2SR.lkp GlBr2UR.lkp DOE
# the Python file for merging the output of the two .lkp networks:
	cp MergeByUR.py DOE
# ...and a list of glosses (taken from Jonathan's '00 Glosses.doc' Word file):
	cp Glosses.txt DOE
# Tar and gzip this directory, and send it to the LDC:
	$(TAR) cfz $@ DOE
	$(GREEN) ; $(ECHO) "Sending DOE deliverable to LDC..." ; $(BLACK)
	$(ECHO) "put $@ /web/edu/upenn/ldc/nahuatl/xfst_dir/" \
	| $(FTP) maxwell@login.ldc.upenn.edu

report:
#Create a report from the RCS log of all hand-produced files and their change comments
# during the specified time period. Unfortunately, it's not possible to limit
# the report to those files that have actually been altered during that time.
	$(ECHO) -n "Input day before first date for report (in format YYYY/MM/DD): " \
	; read START_DATE \
	; $(ECHO) -n "Input day after last date for report (in format YYYY/MM/DD): " \
	; read END_DATE \
	; $(RLOG) -d$$START_DATE'"<"'$$END_DATE $(CUR_RCS_DIR)/*.xfst,v \
	$(CUR_RCS_DIR)/*.py,v \
	$(CUR_RCS_DIR)/makefile,v \
	$(CUR_RCS_DIR)/TestSuite/* \
	> Report.txt

NahuatlGrammar.tar.gz: $(LEXC_FILES) $(XFST_FILES)
#Just the xfst and lexc files; does not include makefiles.
	$(TAR) czf $@ $^

endif #ifndef NAHUATL_MAKEFILE
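
#For reference, the most commonly useful targets defined above (this summary is
# descriptive only; all the target names come from the rules in this makefile):
#	make Parser.fsm		build the parser network (liberal phonology)
#	make Generator.fsm	build the generator network (strict phonology)
#	make all		build Stems.out, WordAnalyses.out and Debug.out
#	make TestAll		regenerate all of the files in $(TESTSUITE_OUT)
#	make backup.tar.gz	tar up the sources and ship them to the backup sites
#	make DOE.tar.gz		build the DOE deliverable and send it to the LDC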