#Nahuatl makefile.
#Fix: non-Unicode chars in here!
ifndef NAHUATL_MAKEFILE
NAHUATL_MAKEFILE = Languages/Nahuatl/makefile

#-------------Definitions--------
#The ff. are mostly so make doesn't complain about undefined variables
# (has to be before 'include's):
LANGUAGE=Nahuatl
LANGUAGE_UC=NAHUATL

#Defns which are dependencies in supra-makefiles must go here.
# (Defns which are used in the recipes themselves can go later.)
PARSER=Parser.fsm
#This uses the liberal version of the phonological rules,
# e.g. it allows k-deletion (or k --> ') to be relatively unconstrained.
# (See the file PhonologyRules.xfst, in particular the ifdef's.)
GENERATOR=Generator.fsm
#This uses the conservative version of the phonological rules,
# e.g. the k-deletion rules are fairly restricted. This allows us
# to avoid too much ambiguity in generating forms, but it may not
# parse all valid forms in text.
# RX_FILE is not defined, because I can't figure out the structure
# of the dictionary's SFM codes.

#Get defns from parent makefile:
ifdef THIS_DIR
THIS_DIR:=$(THIS_DIR)/..
else
THIS_DIR:=$(CURDIR)
endif
include $(THIS_DIR)/../makefile

DIALECT = OAPAN
#Re-define for other dialects
ENCODING = "UTF-8"
GEN_PARADIGMS_SCHEMA = ParadigmDefn.xsd
#XML Schema file to use in validating XML paradigm files
GEN_PARADIGM = $(PYTHON) $(LANGUAGES_DIR)/GenParadigm.py
#Generates the paradigm of a single word, provided on the cmd line
GEN_PARADIGMS = $(PYTHON) $(LANGUAGES_DIR)/GenParadigms.py $(DEBUG_GEN_PARADIGMS)
#Generates the paradigm of a list of words
DEBUG_GEN_PARADIGMS =
#Setting DEBUG_GEN_PARADIGMS to "-d" will cause the above GEN_PARADIGMS to output debug info

CLEAN_FILES = *.fsm *.lkp *.out *.parse *.pyc LoadRoots.xfst $(LOCAL_LEX) lexc/* \
	ParserFSMs/* GeneratorFSMs/* utf8/* TestSuite/*.html
#Some intermediate .in files are created in the dir TestSuite, but they are not included here
# because the file TestSuite/2VerbTenses.in is hand-built, and because the intermediate .in
# files are automatically deleted by 'make'. We do NOT clean the UTF-8 and XML versions of
# the dictionary, because the latter requires cranking up Toolbox.

LEXICON_FILES = Dictionary.sfm
UTF8_LEX = Dictionary.utf8
XML_LEX = Dictionary.xml
# An XML version of the Nahuatl dictionary, as produced by Shoebox/Toolbox.
# (Most of the Shoebox fields are superfluous for purposes of morphological parsing,
# so this XML version may only contain some of the fields.)
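
#Illustrative usage, not part of the build: the settings above are plain make variables,
# so they can be overridden per-invocation on the command line in the usual GNU make way.
# The particular targets and the alternative dialect name below are only examples; they
# assume the corresponding gpp conditionals and Defn.xml files actually exist:
#	make DEBUG_GEN_PARADIGMS=-d TestSuite/VerbParadigms.in
#	make DIALECT=AMEYALTEPEC Parser.fsm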
LEXC_FILES = ./lexc/Adj.lexc \
	./lexc/N.lexc \
	./lexc/N^Inalienable.lexc \
	./lexc/N^N1.lexc \
	./lexc/N^N1dom.lexc \
	./lexc/N^N1N2.lexc \
	./lexc/N^N2.lexc \
	./lexc/N^ObligPoss.lexc \
	./lexc/N^PartWhole.lexc \
	./lexc/N^PartWholeOnly.lexc \
	./lexc/Uninflected.lexc \
	./lexc/V0.lexc \
	./lexc/V1.lexc \
	./lexc/V2.lexc \
	./lexc/V3.lexc \
	./lexc/V4.lexc

XFST_FILES = ./ParserFSMs/AdjInflection.xfst \
	./ParserFSMs/LoadRoots.xfst \
	./ParserFSMs/NounInflection.xfst \
	./ParserFSMs/NounStems.xfst \
	./ParserFSMs/Parser.xfst \
	./ParserFSMs/PhonologyClasses.xfst \
	./ParserFSMs/PhonologyRules.xfst \
	./ParserFSMs/Reduplication.xfst \
	./ParserFSMs/SurfaceForms.xfst \
	./ParserFSMs/VerbInflection.xfst \
	./ParserFSMs/VerbPrefixes.xfst \
	./ParserFSMs/VerbStems.xfst \
	./ParserFSMs/VerbSuffixes.xfst

TESTSUITE_OUT = TestSuite/VerbTable.out \
	TestSuite/2VerbTenses.out \
	TestSuite/SpecialTests.out \
	TestSuite/komin.out \
	TestSuite/nimkin.out \
	TestSuite/Reduplication.out \
	TestSuite/Nouns.out \
	TestSuite/AdjectivePlurals.out \
	TestSuite/Diminutives.out \
	TestSuite/NounPlurals.out \
	TestSuite/VerbParadigms.html \
	TestSuite/VerbStemForms.html \
	TestSuite/aw.html \
	TestSuite/NounParadigms.html \
	TestSuite/SpanishNounParadigms.html \
	TestSuite/AdjParadigms.html

INSTALL_DIR = $(LORAX_ROOT)/spd25/htdocs/hyperlex2/nahuatl/
#Jonathan's dir for Nahuatl stuff

NETWORK_FILES = $(PARSER) $(GENERATOR) SR2Gl.lkp SR2UR.lkp Gl2UR.lkp GlBr2UR.lkp \
	UR2SR.lkp UR2Gl.lkp
#All the 'compiled' networks, both those used for interactive lookup
# (= $(PARSER) and $(GENERATOR)) and those used for batch-style lookup
# (= the .lkp files).

ZIP_FILES = *.xfst *.py *.xml *.xsd *.xsl makefile ../makefile \
	TestSuite/*.in TestSuite/*.list TestSuite/*.Defn.xml TestSuite/EntityDefns.xml \
	../GenParadigm.py
#Source files to be zipped up and sent to some backup facility

XFST_FLAGS = -q -utf8 -e "set verbose OFF" -e "set quit-on-fail ON"

#Jonathan's dictionary has ff. 8-bit chars in the \lxo field:
# acute 'a' and 'A' (once for upper case, but does not appear to be a typo)
# acute 'e'
# acute 'i'
# hat 'i' (once, probably a typo)
# acute 'o'
# acute 'u'
#We convert the dictionary and the xfst files to UTF-8 (see recipe for LOCAL_LEX)
# before loading them. (I've kept the xfst files in ISO-8859-1 so I can use
# a fixed-width font.)

#Choice of tokenizers:
XFST_TOKENIZER = $(XTOKENIZER) NahTokenize.fsm
#This has special handling for the 'ma' clitic.
TR_TOKENIZER = $(TR) -s "\".,!?()[]{}-��]; \r\n\t" "\n"
#Fix: non-Unicode chars! omit in favor of XFST_TOKENIZER?
#Break at white space and most punctuation (but not colon, which marks length).
# Punctuation is turned into newlines (i.e. ignored). This does _not_ do the right
# thing with the 'ma' clitic, which is written separately but needs to be tokenized
# with the word that follows. The XFST_TOKENIZER does the right thing.
#PED_TOKENIZER = $(PED) -r cp1252 -w utf8 \
#	-e 's/([��\(�>])/$$1\n/g' \
#	-e 's/(["\"\.,\!\?\){}\-�;< ])/\n$$1/g'
PED_TOKENIZER = $(ICONV) -f cp1252 -t utf8 \
	| $(SED) -e 's/([��\(�>])/$$1\n/g' \
	-e 's/(["\"\.,\!\?\){}\-�;< ])/\n$$1/g'
#Similar to TR_TOKENIZER, except retains the punctuation tokens. Re-worked
# to use sed rather than ped, but not tested. Square brackets have been
# removed earlier (they contain phonemes omitted in fast speech).
#Fix: non-Unicode chars! omit in favor of XFST_TOKENIZER?
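
#A possible sanity check for the sed-based PED_TOKENIZER (marked above as untested):
# run it over a small cp1252-encoded sample via a throwaway target along these lines.
# The 'sample.txt' file and the 'check-tokenizer' name are hypothetical, and the
# target is deliberately left commented out (it is not part of the build):
#
#check-tokenizer: sample.txt
#	$(CAT) $< | $(PED_TOKENIZER) | head -40
#
# Each word and each retained punctuation mark should come out on its own line,
# with no cp1252 bytes left in the output.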
NAH_TOKENIZER = $(XFST_TOKENIZER)

LOWER_CASE = $(TR) "A-Z" "a-z"

#Override the defn in the next dir up, to include conversion to UTF-8
# (otherwise 'ped' messes up):
NORMALIZE_SFMS = \
	REC_SEP=$(IN_REC_SEP) ; \
	$(CAT) $(LEXICON_FILES) \
	| $(TR) -s "\t\r " " " \
	| $(ICONV) -f ISO-8859-1 -t UTF-8 \
	| $(ONE_REC_PER_LINE) \
	| $(ONE_FLD_PER_LINE)

POS_SFM = psm

.PHONY: all Stems WordAnalyses Debug
.PRECIOUS: TestSuite/%.xml

#The SFMs whose content we need to extract from the Shoebox lexicon
# (not including the leading backslash):
IN_REC_SEP = ref
LEMMA_SFM = lxoa
#Oapan primary lemma
UR_SFM = lxoa_pr
#Oapan UR (if present, used instead of primary lemma as underlying form)
POS_SFM = psm
#POS
VERB_PARADIGM_SFM = infv
#Paradigm class for verbs
NOUN_SUBCLASS_SFM = infn
#Subclasses for nouns (telling whether they are optionally or obligatorily possessed, etc.)
ALLOMORPH_SFM = allomorph
#Contains code for allomorph properties. Never more than one code, although at least one
# is a two-word code: "object deletion". A few have an explicit allomorph, but this may
# be removed later.
NA_LEMMA = "----"
#Code Jonathan uses in fields which are NA (?)

#------------Dependencies--------
#Parser and generator differ in dependencies only in their respective subdirectories.

#PhonologyClasses dependence is automatic, since it depends only on its
# corresponding .xfst file

#PhonologyRules:
GeneratorFSMs/PhonologyRules.fsm: GeneratorFSMs/PhonologyRules.xfst GeneratorFSMs/PhonologyClasses.xfst
ParserFSMs/PhonologyRules.fsm: ParserFSMs/PhonologyRules.xfst ParserFSMs/PhonologyClasses.xfst

#Reduplication:
GeneratorFSMs/Reduplication.fsm: GeneratorFSMs/Reduplication.xfst GeneratorFSMs/PhonologyClasses.fsm
ParserFSMs/Reduplication.fsm: ParserFSMs/Reduplication.xfst ParserFSMs/PhonologyClasses.fsm

#Adjectives:
GeneratorFSMs/AdjInflection.fsm: GeneratorFSMs/AdjInflection.xfst GeneratorFSMs/PhonologyClasses.fsm
ParserFSMs/AdjInflection.fsm: ParserFSMs/AdjInflection.xfst ParserFSMs/PhonologyClasses.fsm

#Nouns:
GeneratorFSMs/NounStems.fsm: GeneratorFSMs/NounStems.xfst GeneratorFSMs/PhonologyClasses.fsm GeneratorFSMs/LoadRoots.xfst
ParserFSMs/NounStems.fsm: ParserFSMs/NounStems.xfst ParserFSMs/PhonologyClasses.fsm ParserFSMs/LoadRoots.xfst
# NounStems.xfst doesn't actually load the file 'LoadRoots.xfst', but it does load
# the various noun lexc files that are created when LoadRoots.xfst is created.
GeneratorFSMs/NounInflection.fsm: GeneratorFSMs/NounInflection.xfst GeneratorFSMs/NounStems.fsm GeneratorFSMs/PhonologyClasses.fsm
ParserFSMs/NounInflection.fsm: ParserFSMs/NounInflection.xfst ParserFSMs/NounStems.fsm ParserFSMs/PhonologyClasses.fsm

#Verbs:
GeneratorFSMs/VerbStems.fsm: GeneratorFSMs/VerbStems.xfst GeneratorFSMs/LoadRoots.xfst GeneratorFSMs/PhonologyClasses.fsm \
	GeneratorFSMs/Reduplication.fsm
ParserFSMs/VerbStems.fsm: ParserFSMs/VerbStems.xfst ParserFSMs/LoadRoots.xfst ParserFSMs/PhonologyClasses.fsm \
	ParserFSMs/Reduplication.fsm
# VerbStems.xfst doesn't actually load the file 'LoadRoots.xfst', but it does load
# the various verbal lexc files that are created when LoadRoots.xfst is created.
GeneratorFSMs/VerbPrefixes.fsm: GeneratorFSMs/VerbPrefixes.xfst
ParserFSMs/VerbPrefixes.fsm: ParserFSMs/VerbPrefixes.xfst
GeneratorFSMs/VerbSuffixes.fsm: GeneratorFSMs/VerbSuffixes.xfst
ParserFSMs/VerbSuffixes.fsm: ParserFSMs/VerbSuffixes.xfst
GeneratorFSMs/VerbInflection.fsm: GeneratorFSMs/VerbInflection.xfst GeneratorFSMs/VerbStems.fsm \
	GeneratorFSMs/VerbPrefixes.fsm GeneratorFSMs/VerbSuffixes.fsm \
	GeneratorFSMs/PhonologyClasses.fsm
ParserFSMs/VerbInflection.fsm: ParserFSMs/VerbInflection.xfst ParserFSMs/VerbStems.fsm \
	ParserFSMs/VerbPrefixes.fsm ParserFSMs/VerbSuffixes.fsm \
	ParserFSMs/PhonologyClasses.fsm

#SpellingRelaxationRules exists only as a parser:
ParserFSMs/SpellingRelaxationRules.fsm: ParserFSMs/SurfaceForms.fsm ParserFSMs/SpellingRelaxationRules.xfst

#SurfaceForms (complete parser):
GeneratorFSMs/SurfaceForms.xfst: SurfaceForms.xfst
#GeneratorFSMs/SurfaceForms.fsm is not actually built, but we list it here anyway...
GeneratorFSMs/SurfaceForms.fsm: GeneratorFSMs/SurfaceForms.xfst GeneratorFSMs/VerbInflection.fsm GeneratorFSMs/NounInflection.fsm \
	GeneratorFSMs/AdjInflection.fsm GeneratorFSMs/PhonologyClasses.fsm GeneratorFSMs/PhonologyRules.fsm
ParserFSMs/SurfaceForms.xfst: SurfaceForms.xfst
ParserFSMs/SurfaceForms.fsm: ParserFSMs/SurfaceForms.xfst ParserFSMs/VerbInflection.fsm ParserFSMs/NounInflection.fsm \
	ParserFSMs/AdjInflection.fsm ParserFSMs/PhonologyClasses.fsm ParserFSMs/PhonologyRules.fsm

#------------------------------------------------
#-------------------Recipes----------------------
#------------------------------------------------

#---------Building parser and generator----------
all: Stems.out WordAnalyses.out Debug.out

$(UTF8_LEX): $(LEXICON_FILES)
#Convert to UTF-8, so Toolbox can read it without silently deleting ISO chars.
	$(ICONV) -f ISO-8859-1 -t UTF-8 < $^ > $@

$(XML_LEX): Dictionary.utf8
#This cannot be produced by makefile, only by the Toolbox GUI.
# But if the xml version is not newer than the UTF-8 version,
# we can at least signal an error:
	$(ECHO) "Cannot proceed; the XML version needs to be produced from the UTF8 version by Toolbox."
	exit 1

$(LOCAL_LEX): $(LEXICON_FILES)
#LOCAL_LEX is defined in the Languages makefile as NormalizeLex.db;
# the local defn for NORMALIZE_SFMS converts the dictionary entries
# to UTF-8.
	$(BANNER)
	$(NORMALIZE_SFMS) \
	> $@

GeneratorFSMs/%.xfst: %.xfst
#Create a version of the source code which only allows obligatory phonological rules,
# plus a subset of the optional ones. Also don't include certain rare suffixes (like -tia).
# At present, we don't actually use the gpp macro name 'GENERATOR', but we have
# it here just in case...
	$(BANNER)
# First create a UTF-8 version of the file in question:
	$(CAT) $< \
	| $(ICONV) -f ISO-8859-1 -t UTF-8 \
	| $(GPP) -D GENERATOR -D $(DIALECT) \
	> GeneratorFSMs/$<

GeneratorFSMs/%.fsm: GeneratorFSMs/%.xfst
# Then compile it:
	cd GeneratorFSMs ; $(XFST) -e 'source $(notdir $<)' -e 'save defined $(notdir $@)' -stop

ParserFSMs/%.xfst: %.xfst
#Create a version of the source code that allows both obligatory and optional phonological rules,
# and includes all suffixes (even the rare ones, like -tia).
	$(BANNER)
# First create a UTF-8 version of the file in question:
	$(CAT) $< \
	| $(ICONV) -f ISO-8859-1 -t UTF-8 \
	| $(GPP) -D PARSER -D $(DIALECT) \
	> ParserFSMs/$<

ParserFSMs/%.fsm: ParserFSMs/%.xfst
# Then compile it:
	cd ParserFSMs ; $(XFST) -e 'source $(notdir $<)' -e 'save defined $(notdir $@)' -stop

$(GENERATOR): GeneratorFSMs/SurfaceForms.xfst GeneratorFSMs/PhonologyClasses.fsm \
	GeneratorFSMs/PhonologyRules.fsm GeneratorFSMs/VerbInflection.fsm \
	GeneratorFSMs/NounInflection.fsm GeneratorFSMs/AdjInflection.fsm
#Build the generator, which allows only the strict version of some phonological rules.
	$(BANNER)
	$(CAT) GeneratorFSMs/SurfaceForms.xfst \
	| $(ICONV) -f ISO-8859-1 -t UTF-8 \
	| $(GPP) -D GENERATOR -D $(DIALECT) \
	> GeneratorFSMs/Generator.xfst
	cd GeneratorFSMs ; $(XFST) -e 'source Generator.xfst' -e 'save defined ../$@' -stop

$(PARSER): ParserFSMs/SurfaceForms.xfst ParserFSMs/PhonologyClasses.fsm \
	ParserFSMs/PhonologyRules.fsm ParserFSMs/VerbInflection.fsm \
	ParserFSMs/NounInflection.fsm ParserFSMs/AdjInflection.fsm
#Build the parser, which allows variant phonological rules.
	$(BANNER)
	$(CAT) ParserFSMs/SurfaceForms.xfst \
	| $(ICONV) -f ISO-8859-1 -t UTF-8 \
	| $(GPP) -D PARSER -D $(DIALECT) \
	> ParserFSMs/Parser.xfst
	cd ParserFSMs ; $(XFST) -e 'source Parser.xfst' -e 'save defined ../$@' -stop

TextsToParse/%.utf8: TextsToParse/%.txt
#Convert a paragraph-formatted text into a tokenized text, including conversion
# to UTF-8. Steps:
# 1) Convert any tab chars or CRs to a (single) space char, and any sequence
#    of space chars to a single space char.
# 2) Temporarily convert existing newlines (which mark paragraphs) to tabs.
# 3) Get rid of any footnote markers. These are either a digit (and this seems
#    to be the only use of digits), or a code of the form [JDA<n>], where <n>
#    is a sequence of digits (only one in the data, but we handle multiple digits).
# 4) Get rid of any remaining square brackets, which surround phonemes which
#    were omitted in fast speech.
# 5) Tokenize.
# 6) Convert the tab chars from step (2) to a sequence of two newlines.
# 7) Get rid of the space chars left over from tokenization (all the other
#    token separators, e.g. punctuation, need to be retained).
# 8) Convert upper case to lower case.
	@$(BANNER)
	@ $(CAT) $< \
	| $(TR) -s '\t\r ' ' ' \
	| $(TR) '\n' '\t' \
	| $(TR) -d '0123456789' \
	| $(SED) -e 's/\[JDA\]//g' \
	| $(TR) -d '[]' \
	| $(PED_TOKENIZER) \
	| $(SED) -e 's/\t/\n\n/g' \
	| $(TR) -d ' ' \
	| $(LOWER_CASE) \
	> $@

TextsToParse/%.parse: TextsToParse/%.utf8 SR2Gl.lkp
#Parse a pre-tokenized text. (Apparently 'lookup' introduces CRs, which we
# dutifully remove...)
	@$(BANNER)
	$(CAT) $< \
	| $(LOOKUP) -utf8 -flags TT SR2Gl.lkp \
	| $(TR) -d '\r' \
	> $@

TextsToParse/UnparsedTypes.utf8: TextsToParse/*.parse
#Collect a list of all the types (not tokens) which fail to parse, omitting
# punctuation and things between <spn>...</spn> tags.
# Steps:
# (1) Select all the lines that don't parse (these are marked by '+?').
# (2) Get rid of all but the first column, which contains the input word.
# (3) Put everything on one line...
# (4) ...so we can remove any sequences consisting of things inside <spn>...</spn>
#     tags (including removing the tags).
# (5) Break the tokens back up into one per line.
# (6) Sort the tokens uniquely, giving types.
# (7) Get rid of any punctuation tokens (but 'grep' doesn't recognize non-ASCII
#     punctuation).
	@$(BANNER)
	$(FGREP) "+?" $^ \
	| $(CUT) -f 1 \
	| $(TR) -s " \n" " " \
	| $(SED) -e "s%<spn>[^<]*</spn>%%g" \
	| $(TR) " " "\n" \
	| $(GREP) -v "^[[:punct:]]$$" \
	| $(SORT) -u \
	> $@

#--------------Testing-------------
#The following are used to create two levels of a three-level interlinear format;
# they are intended for use with the 'lookup' program, and are mnemonically labeled
# with the input on the left and the output on the right (and a '2' in between
# as a reminder that these 'lookup' files only work in one direction).
# These are used to generate paradigm forms for regression testing.
SR2Gl.lkp: $(PARSER)
#Parse the surface form into its glosses.
	@$(BANNER)
	$(XFST) -e 'load defined $<' -e 'read regex GlSR;' -e 'save stack $@' -stop

SR2UR.lkp: $(PARSER)
#Parse the surface form into an underlying form.
	@$(BANNER)
	$(XFST) -e 'load defined $<' -e 'read regex URSR;' -e 'save stack $@' -stop

Gl2UR.lkp: $(PARSER)
#Generate the underlying form from the gloss form.
	@$(BANNER)
	$(XFST) -e 'load defined $<' -e 'read regex GlUR;' -e 'invert net' -e 'save stack $@' -stop

GlBr2UR.lkp: $(PARSER)
#Generate the underlying form from the gloss form (but where the gloss forms
# have square brackets around the stems):
	@$(BANNER)
	$(XFST) -e 'load defined $<' -e 'read regex GlBrUR;' -e 'invert net' -e 'save stack $@' -stop

UR2SR.lkp: $(PARSER)
#Generate the surface form from the underlying form.
	@$(BANNER)
	$(XFST) -e 'load defined $<' -e 'read regex URSR;' -e 'invert net' -e 'save stack $@' -stop

UR2Gl.lkp: $(PARSER)
#Relate a seq of underlying forms to their glosses.
	@$(BANNER)
	$(XFST) -e 'load defined $<' -e 'read regex GlUR;' -e 'save stack $@' -stop

ParadigmClasses: $(LOCAL_LEX)
#Construct a list of unique paradigm classes, for verbs and nouns (sorted separately).
# Lists the (unique) contents of the verbal paradigm class field (\infv) and the noun
# paradigm class field (\infn).
	$(BANNER)
# First verbs:
	$(ECHO) "------Verb Paradigm classes------" \
	> $@
	FIELDS=$(VERB_PARADIGM_SFM) ; \
	$(EXTRACT_FIELDS) \
	| $(SORT) -u \
	>> $@
# ...a blank line:
	$(ECHO) \
	>> $@
# Then nouns:
	$(ECHO) "------Noun Paradigm classes------" \
	>> $@
	FIELDS=$(NOUN_SUBCLASS_SFM) ; \
	$(EXTRACT_FIELDS) \
	| $(SORT) -u \
	>> $@

ParadigmCounts: ParadigmClasses
#Construct a list of unique paradigm classes, each followed by a (tab separated) count.
# The list is sorted from most common to least common within verbs, and then
# (separately) within nouns.
##FIX: not working; I get 0 counts on many classes, which is impossible.
# The problem is probably in ../CountStrings.py, which is treating the strings as
# regexs, so that special chars like "<" get funny treatment.
	$(BANNER)
# First verbs:
	$(ECHO) "------Verb Paradigm counts------" \
	> $@
	RXPREFIX="\\$(VERB_PARADIGM_SFM)\ " ; \
	RXSUFFIX="$$" ; \
	$(COUNTS)
# ...a blank line:
	$(ECHO) \
	>> $@
# Then nouns:
	$(ECHO) "------Noun Paradigm counts------" \
	>> $@
	RXPREFIX="\\$(NOUN_SUBCLASS_SFM)\ " ; \
	RXSUFFIX="$$" ; \
	$(COUNTS)

LoadRoots.xfst: $(XML_LEX) NahuatlCodes.py
#Extract the lex entries for the roots from the XML version of the lexicon.
# One lexc file is created in a subdir for each inflected POS, plus one
# lexc file for all uninflected POSs together, plus the target file containing
# the xfst code to load those lexc files. We also produce a summary count
# of all lexical errors.
# The data is extracted from an XML/UTF-8 version of the dictionary.
	@$(BANNER)
# Remove all the existing files in the lexc directory, rather than just
# overwriting them, because if a category is removed from the list of inflected
# categories, we don't want to leave its corresponding lexc file lying around:
	-rm lexc/*
# Now create the new lexc files, and the target file. Toolbox used to replace
# the '<' and '>' of Jonathan's embedded XML tags with &lt; and &gt;, and this
# is necessary in order for Python to interpret them for our purposes. Toolbox
# seems no longer to do this, so we use 'sed' to do it. If Toolbox reverts to
# replacing them, the 'sed' op will be a no-op (and therefore safe).
	cd lexc ; \
	$(CAT) ../$(XML_LEX) \
	| $(SED) -e "s/<klamoa>/\&lt;klamoa\&gt;/" \
	-e "s%</klamoa>%\&lt;/klamoa\&gt;%" \
	| $(PYTHON) -O ../NahTab2MultLexc.py -c ../NahuatlCodes.py -e LoadRootsErrors.txt \
	> ../$@
# Output a summary of the lexicon warnings, sorted according to general type (done
# by deleting the specifics of each msg, namely everything after a colon or single quote),
# plus a count for each type:
	@$(MAGENTA); $(ECHO) "----------Summary of lexicon warnings follows-----------" > /dev/tty ; $(BLACK)
	@$(GREEN)
	@ $(CAT) lexc/LoadRootsErrors.txt \
	| $(GREP) Warning \
	| $(SED) -e"s/^ *Warning\: //" \
	-e"s/[\:\x27].*$$//" \
	| $(SORT) \
	| $(UNIQ) -c \
	| $(TEE) lexc/LoadRootsSummary.txt # Send summary to both screen and file; latter is in RCS
	@$(MAGENTA); $(ECHO) "--------------------------------------------------------" > /dev/tty ; $(BLACK)

Stems.out: VerbStems.fsm
#Output the Nahuatl verb stems sorted by class. The class is assumed to be the
# second item (-k 2) on the line, using '^' to separate items; the first item
# is an identifier of the stem type, like "SHORT:", some spaces, and the lexeme.
# This allows us to sort short and long stems of the same lexeme (actually, class) together.
# Finally, we use 'sed' to remove the class names, and then use sed's 'G' command to
# stick in a newline after each SHORT stem (the short stems for some reason wind up last).
# Can't use the -f flag to read in the file--have to use "-e source <fname>".
	@$(BANNER)
	@$(XFST) -e "source VerbStems.xfst" -e "read regex LabeledVStem;" -e "print lower-words;" -stop \
	| $(SORT) -k 2 -t'^' \
	| $(SED) -e "s/\^C.*$$//" \
	| $(SED) '/SHORT/{G;};' \
	> $@

Debug.out: Debug.xfst $(GENERATOR)
#Run various diagnostics, including upper and lower alphabets, check for unmatched flag diacritics.
	@$(BANNER)
	$(XFST) -e "source $<" -stop \
	> $@
# Also check for any unmatched flag diacritics, which would appear inside a "<...>"
# in the following (the leading "-" is because if there are no unmatched flags,
# grep will signal an error). We put the output in a temp file...
	- $(XFST) -e "load defined VerbInflection.fsm" \
	-e "read regex UnderlyingFormUR;" \
	-e "label net" \
	-e "set show-flags ON" \
	-e "print words" \
	-stop \
	| $(GREP) "@" \
	| $(GREP) "<" \
	> $(TMP)/$@
# ...then check whether we got anything. If the above file is of size > 0,
# then the two greps must have found s.t., namely a one-sided flag diacritic;
# add this to the target file:
	@if test -s $(TMP)/$@ ; then \
	  $(ECHO) >> $@ ; \
	  $(ECHO) "Warning: One-sided flag diacritics:" >> $@ ; \
	  cat $(TMP)/$@ >> $@ ; \
	  $(ERROR) "Warning: One-sided flag diacritics, see $@ for details." ; \
	else \
	  $(ECHO) "One-sided flag diacritic test passed successfully." >> $@ ; \
	fi

LowerAlphabet.txt: $(GENERATOR)
#Create a list of the symbols in the lower side alphabet, not counting
# the diacritic flags. NB: There is a space char that shows up inside
# double quotes. The space gets turned into a newline, leaving the two
# double quotes, which we grep out to avoid confusion.
	$(XFST) -e "loadd $<" \
	-e "regex GlSR .l ;" \
	-e "print labels" \
	-stop \
	| $(GREP) -v "Size" \
	| $(TR) " " "\n" \
	| $(GREP) -v "@" \
	| $(GREP) -v '"' \
	> $@
	-$(RCSDIFF) /cygdrive/c/RCS/C/Data/Data/Languages/Nahuatl/$@,v $@

UpperAlphabet.txt: $(GENERATOR)
#Create a list of the symbols in the upper side alphabet, not counting
# the diacritic flags. NB: There is a space char that shows up inside
# double quotes. The space gets turned into a newline, leaving the two
# double quotes, which we grep out to avoid confusion.
	$(XFST) -e "loadd $<" \
	-e "regex GlSR .u ;" \
	-e "print labels" \
	-stop \
	| $(GREP) -v "Size" \
	| $(TR) " " "\n" \
	| $(GREP) -v "@" \
	| $(GREP) -v '"' \
	> $@
	-$(RCSDIFF) /cygdrive/c/RCS/C/Data/Data/Languages/Nahuatl/$@,v $@

GlossTable.txt: UpperAlphabet.txt ExpandGlosses.lkp
#Create a table of all the affixal glosses, and add a second column with
# the gloss abbreviations expanded. Used for the DOE tool.
# Unclear why 'lookup' puts tabs between leading dashes and rest of output,
# so we use 'sed' to remove these (and 'tr' to remove extraneous space chars,
# and 'grep' to remove the blank lines 'lookup' inserts between records).
	$(CAT) $< \
	| $(TR) " " "\n" \
	| $(GREP) "[\=\+\-]" \
	| $(LOOKUP) ExpandGlosses.lkp \
	| $(SED) -e"s/- /-/" \
	| $(TR) -d " " \
	| $(GREP) -v "^$$" \
	1> $@ 2> /dev/null

#-------------------Test Suite----------------
TestSuite/VerbTable.out: TestSuite/VerbTable.in $(GENERATOR)
#Probably a temporary recipe, to generate the verb table for Jonathan's demo.
# Pass it through dos2unix to get rid of the CRs that xfst introduces.
	$(XFST) -f $< \
	| $(DOS2UNIX) \
	> $@

TestSuite/%.out: TestSuite/%.in $(PARSER)
#This set of test forms is based on the Word doc '2 Verb tenses.doc'.
# It may be superseded at some point by the XML paradigm generators below.
# Passing it through DOS2UNIX seems to be necessary, else xfst inserts CRLFs.
	$(BANNER)
	$(XFST) -q \
	-e "loadd $(PARSER)" \
	-e "regex GlSR; " \
	-e "source $<" \
	-stop \
	| $(DOS2UNIX) \
	> $@

TestSuite/%.in: TestSuite/%.Defn.xml TestSuite/%.Stems.list TestSuite/EntityDefns.xml
# Validate the XML file listing the paradigm definitions, then
# use it to generate the xfst input file (.in) from the list of stems+POSs.
	$(BANNER)
# Use xsltproc to pre-process the paradigm defn file to include some entities:
	-rm $(TMP)/$(notdir $<)
	cd TestSuite \
	; $(XSLTPROC) $(NULL_STYLESHEET) $(notdir $<) > $(TMP)/$(notdir $<)
# Make sure the result is a valid paradigm defn file:
	@$(ECHO) Validating paradigms...
	$(XMLLINT) --noout --noent --schema $(GEN_PARADIGMS_SCHEMA) $(TMP)/$(notdir $<)
# If we get here, TestSuite/%.Defn.xml passed the validation;
# use it (or rather, the copy in /tmp) to generate the xfst input file:
	$(GEN_PARADIGMS) -e 'UTF-8' -f $(basename $@).Stems.list -p $(TMP)/$(notdir $<) -r " .o. Lex" -o $@

TestSuite/%.xml: TestSuite/%.in $(PARSER)
#Tell the parser (not the generator) to generate the desired forms for
# the specified test suite input file. The forms are output to an XML file,
# which contains three columns: Surface form--Underlying form--Gloss.
##FIX: Another kludge: the .GlURSR file lacks the square brackets around the root,
# so we use sed to remove the square brackets on the roots here (the square
# brackets are originally put there by GenParadigms.py; maybe we should just
# eliminate them there??)
	$(BANNER)
# First create an XML file with just the gloss forms, as generated
# from the "features". The .in file that we "source" uses the upper side
# of the 'Lex' variable. Since there is some ambiguity going from the upper
# (gloss) side to the lower (UR) side, because of identical glosses for the same
# affixal morpheme ICO dialectal variation etc., we would get multiple but
# identical gloss strings. So we take the upper side of the Lex var, thereby
# avoiding this unwanted ambiguity.
	$(XFST) -e "loadd $(PARSER)" \
	-e "regex Lex .u;" \
	-e "define Lex" \
	-e "source $<" \
	-stop \
	| $(SED) -e "s/\[//" -e "s/\]//" \
	> $(TMP)/$(notdir $(basename $@)).xml
# Now take the words that were generated, unique them...
	$(CAT) $(TMP)/$(notdir $(basename $@)).xml \
	| $(FGREP) -v "<" \
	| $(SORT) -u \
	> $(TMP)/$(notdir $(basename $@)).words
# ...and get all three forms (SR, UR, Gl):
	$(MAKE) $(TMP)/$(notdir $(basename $@)).GlURSR
# Now merge these parses back into the XML file (the '-' arg on xsltproc means "read from stdin").
# Kludge: Unclear why, but we get many duplicate lines if we don't run it through 'uniq'. These
# don't seem to correspond to a real ambiguity in the parser/generator.
	$(PYTHON) SR2Parses.py -u -x $(TMP)/$(notdir $(basename $@)).xml -p $(TMP)/$(notdir $(basename $@)).GlURSR \
	| $(XSLTPROC) SortParadigms.xslt - \
	| $(UNIQ) \
	> $@

TestSuite/%.html: TestSuite/%.xml
#Convert the XML version of a test suite output into an HTML file.
	$(BANNER)
	$(XSLTPROC) ParadigmXML2HTML.xsl $< \
	> $@

TestAll: $(GENERATOR) $(PARSER)
#Test a representative set of data.
# The dependencies on $(GENERATOR) and $(PARSER) are strictly superfluous, because
# the individual files in $(TESTSUITE_OUT) have one or the other as dependencies.
# However, it's cleaner to put them as dependencies for this target, because this
# prevents 'make' from repeatedly trying to make them (once for each file in
# $(TESTSUITE_OUT)) if they cannot be made.
# NB: in order to create an html diff file of the .html output to send to Jonathan,
# do a ComponentSoftware RCS diff of the html file, then do 'File | Save as HTML'.
# Then edit the resulting html-diff file by changing all the '&lt;' to '<',
# and '&gt;' to '>'. Can't automate(?), because there is no way to create the html-diff
# with the fancy yellow color coding and the pop-up navigation window in batch mode.
	for fname in $(TESTSUITE_OUT) ; do \
	  $(MAKE) $$fname ; \
	done

NahTokenize.fsm: NahTokenize.xfst
#Don't use the Tokenize.fsm in the generic makefile.
	$(BANNER)
# Convert to UTF-8:
	$(ICONV) -f ISO-8859-1 -t UTF-8 < $< > utf8/$(notdir $<)
# ...then run xfst (the $(XFST) var includes the value of XFST_FLAGS, which tells
# xfst to assume UTF-8 encoding):
	$(XFST) -e 'source utf8/$(notdir $<)' -stop

%.parse: %.txt SR2Gl.lkp NahTokenize.fsm
#Run the parser over an input text.
#WARNING: This uses ISO-8859-1. If you need UTF-8, then first convert the
# .txt file to .utf8, so it can use the %.utf8-->%.parse recipe above.
# The Xerox tokenizer is used instead of the tr tokenizer, because it allows
# us to tokenize the clitic 'ma' together with the following word (which had
# better be a verb). The output format of the target file is a series of records,
# each representing a non-punctuation token of the input, with records separated
# by a blank line; within a record, each line represents a parse (but if there
# were no parses, the single line of the record contains a '+?').
	$(BANNER)
# Tokenize into two temp files, which we can look at for debugging,
# word counts etc. The first is a normalized version of the original
# text, one lower-cased word per line, omitting punctuation:
	$(CAT) $< \
	| $(NAH_TOKENIZER) \
	| $(LOWER_CASE) \
	> $(TMP)/$(notdir $(basename $@)).norm
# ...and the second is a list of unique words in that normalized text:
	$(CAT) $(TMP)/$(notdir $(basename $@)).norm \
	| $(SORT) -u \
	> $(TMP)/$(notdir $(basename $@)).words
# Then parse both files. The first one (original but normalized text,
# then parsed) becomes the target file of this recipe:
	$(CAT) $(TMP)/$(notdir $(basename $@)).norm \
	| $(LOOKUP) -utf8 -flags TT SR2Gl.lkp \
	> $@
# ...and the second, containing the unique words and their parses
# and failures, goes to another file:
	$(CAT) $(TMP)/$(notdir $(basename $@)).words \
	| $(LOOKUP) -utf8 -flags TT SR2Gl.lkp \
	| $(GREP) -v "^[[:space:]]$$" \
	> $(basename $@).words.parse
# Collect some statistics:
	@$(ECHO) Word counts:
	@$(ECHO) Unique words in text = `$(CAT) $(TMP)/$(notdir $(basename $@)).words | $(WC) -l`
	@$(ECHO) Unique words parsed = `$(CAT) $(basename $@).words.parse \
	| $(FGREP) -v "+?" \
	| $(CUT) -f 1 \
	| $(UNIQ) \
	| $(WC) -l`

%.parse.xml: %.txt $(PARSER) SR2UR.lkp UR2Gl.lkp NahTokenize.fsm
#Run the parser over an input text, and produce an XML interlinear version.
# The output format of the target file is an XML file, containing a sequence of records:
#	<parserOutput>
#	  <parses form="foos">
#	    <parse morphemes="foo+s" glosses="house+PL"/>
#	    <parse morphemes="f+oo+s" glosses="1SgS+run+PAST"/>
#	  </parses>
#	  <parses form=",">
#	    <parse morphemes="," glosses="," />
#	  </parses>
#	  <parses form="baring">
#	  </parses>
#	</parserOutput>
	$(BANNER)
# Tokenize into two temp files, which we can look at for debugging,
# word counts etc. The first is a tokenized and lower-cased version
# of the original text, one lower-cased token per line:
##FIX: Need to deal with xml tags in input, including <spanish>foo</spanish>,
# and use of ellipsis.
	$(CAT) $< \
	| $(NAH_TOKENIZER) \
	| $(LOWER_CASE) \
	| $(TR) -d "\r" \
	> $(TMP)/$(notdir $(basename $@)).norm
# ...and the second is a list of unique words in that normalized text:
	$(CAT) $(TMP)/$(notdir $(basename $@)).norm \
	| $(SORT) -u \
	> $(TMP)/$(notdir $(basename $@)).words
# ...and convert this into a single file, with tab-delimited
# columns for Gl-UR-SR:
	$(MAKE) $(TMP)/$(notdir $(basename $@)).GlURSR
# Now we're ready to read in the normalized text file, and output an XML file:
	$(PYTHON) Text2Parses.py -p $(TMP)/$(notdir $(basename $@)).GlURSR \
	> $@

%.GlURSR: %.words $(PARSER) Gl2UR.lkp UR2SR.lkp
#Convert a .words file containing a list of Nahuatl parses in their gloss form, one per line,
# into a .GlURSR file.
# The latter (output) file contains three tab delimited columns, with the Gloss parse
# (Gl) in the first column, the Underlying Representation (UR) in the second column, and
# the original Surface Representation (SR) in the third column.
# (Used by both the TestSuite files and for parsing a file of Nahuatl text.)
	$(BANNER)
# First use the gloss strings in the .words file to generate their underlying representation.
# We discard the error stream output of 'lookup', because this is just a string
# saying that it has loaded the .lkp file; we also discard any words that didn't generate
# (that shouldn't happen, but...).
	$(CAT) $< \
	| $(LOOKUP) -utf8 -flags TT Gl2UR.lkp \
	2> /dev/null \
	| $(FGREP) -v "+?" \
	| $(TR) -d "\r" \
	| $(GREP) -v "^[[:space:]]*$$" \
	> $(TMP)/$(notdir $(basename $@)).Gl2UR
# For the words that correctly generated, we'll also generate their surface form
# from their underlying form.
# We're generating from their UR, because the stage from Gl to UR is ambiguous (when?)
# But generating from UR is also ambiguous for C-final nouns, because the -inal.possd.sg
# and the -al.possd.sg are homophonous (they only differ on V-final nouns). So we use
# a kludge, and 'uniq' the result.
	$(CAT) $(TMP)/$(notdir $(basename $@)).Gl2UR \
	| $(CUT) -f 2 \
	| $(LOOKUP) -utf8 -flags TT UR2SR.lkp \
	| $(GREP) -v "^[[:space:]]*$$" \
	| $(UNIQ) \
	> $(TMP)/$(notdir $(basename $@)).UR2SR
# Now convert these two files into a single file, with tab-delimited
# columns for Gl-UR-SR. The merging is done by the UR, since one
# gloss form may generate > one underlying representation, and we want
# to get the right one:
	$(PYTHON) MergeByUR.py -l $(TMP)/$(notdir $(basename $@)).Gl2UR -r $(TMP)/$(notdir $(basename $@)).UR2SR \
	> $@

#-------------Backup, Delivery------------------
backup.tar.gz: $(ZIP_FILES)
#Backup the source code files to a tar.gz file for xfer to a backup location.
	$(TAR) cfz $@ $^
	$(GREEN) ; $(ECHO) "Backing up to LDC..." ; $(BLACK)
	$(ECHO) "put $@ /web/edu/upenn/ldc/nahuatl/xfst_dir/" \
	| $(FTP) maxwell@login.ldc.upenn.edu
	$(GREEN); $(ECHO) "Backing up to Balsas..." ; $(BLACK)
	$(ECHO) "put $@ /home/cluster1/data/o/d/1121861/html/maxwell" \
	| $(FTP) mmaxwell@www.balsas-nahuatl.org

ldc: $(NETWORK_FILES)
#Upload the parser, generator, and .lkp files to the LDC server. This is complicated
# by the fact that all the 'sftp' cmds that operate on the file system (like rm, ls,
# put) take only a single arg. So things have to be done with separate cmds, using
# loops. We therefore build a file of commands (easier than echoing all the stuff
# in the command line):
	$(ECHO) "cd /web/edu/upenn/ldc/nahuatl/xfst_dir/" > $(TMP)/ftpcmds.txt
# Then create a 'rm' cmd and a 'put' cmd for each file (if we don't first rm them,
# 'put' won't overwrite them):
	for fn in $(NETWORK_FILES) ; do \
	  $(ECHO) "rm $$fn" >> $(TMP)/ftpcmds.txt ; \
	  $(ECHO) "put $$fn" >> $(TMP)/ftpcmds.txt ; \
	done
# Add two commands to list the results, to make sure it worked
# (need separate cmds for .fsm and .lkp, because the ftp version of 'ls'
# only takes a single arg):
	$(ECHO) "ls -l *.fsm" >> $(TMP)/ftpcmds.txt
	$(ECHO) "ls -l *.lkp" >> $(TMP)/ftpcmds.txt
# ...and do it (can't use the -b arg on sftp, because then you have to use
# non-interactive authorization, rather than a password):
	$(CAT) $(TMP)/ftpcmds.txt | $(FTP) maxwell@login.ldc.upenn.edu

balsas: $(PARSER) $(GENERATOR)
#Upload the parser and generator to Balsas (= the http site). When the xfer
# is done, we do an 'ls' to verify that the files got there.
# The '\n' in the commands below are newlines, not part of directory names!
	ls -l *.fsm
	$(ECHO) -e "cd html/programmers\nput $(PARSER)\nput $(GENERATOR)\nls -l *.fsm" \
	| $(FTP) mmaxwell@ftp.balsas-nahuatl.org

DOE.tar.gz: $(PARSER)
#Create the deliverable for the DOE grant as a tar.gz file, and send it to the LDC.
# Includes:
#	The compiled parser
#	The parser (but not generator) versions of the Oapan xfst files (like
#	  the source .xfst files, but gpp has already been run to remove generator
#	  versions of rules and any non-Oapan stuff)
#	The UTF-8 XML (not SFM) lexicon
#	The Python files to convert the UTF-8 lexicon into the *.lexc files
#	The *.lexc files
# First clear out anything in the DOE dir:
	-rm -r DOE
	mkdir DOE
# ...then start populating it:
# Unicode XML lexicon: As of 27 June 2009, we do not send this, because Jonathan's
# XML dictionary is adequate (and correctly converts the '<' and '>' of embedded
# XML tags to '&lt;' and '&gt;', which for some reason Toolbox does not).
##	cp Dictionary.xml DOE/
# Oapan parser source code:
	mkdir DOE/XfstFiles
	cp ParserFSMs/*.xfst DOE/XfstFiles
# Python converter files:
	cp NahTab2MultLexc.py NahuatlCodes.py DOE/
# Lexc files are created from the XML lexicon, so we only create their directory:
	mkdir DOE/lexc
# Shell scripts (and Dave Graff's Perl script) to create the executable and parse:
	cp CompileParser.sh ParseText.sh ParseText.pl DOE
# xfst and lookup networks:
	xfst -utf8 -q -e 'load defined Parser.fsm' -e 'read regex URSR;' \
	-e 'save stack UR2SR.lkp' -stop
	xfst -utf8 -q -e 'load defined Parser.fsm' -e 'read regex GlBrUR;' \
	-e 'save stack GlBr2UR.lkp' -stop
	cp $(PARSER) UR2SR.lkp GlBr2UR.lkp DOE
# the Python file for merging the output of the two .lkp networks:
	cp MergeByUR.py DOE
# ...and a list of glosses (taken from Jonathan's '00 Glosses.doc' Word file):
	cp Glosses.txt DOE
# Tar and gzip this directory, and send it to the LDC:
	$(TAR) cfz $@ DOE
	$(GREEN) ; $(ECHO) "Sending DOE deliverable to LDC..." ; $(BLACK)
	$(ECHO) "put $@ /web/edu/upenn/ldc/nahuatl/xfst_dir/" \
	| $(FTP) maxwell@login.ldc.upenn.edu

report:
#Create a report from the RCS log of all hand-produced files and their change comments
# during the specified time period. Unfortunately, it's not possible to limit
# the report to those files that have actually been altered during that time.
	$(ECHO) -n "Input day before first date for report (in format YYYY/MM/DD): " \
	; read START_DATE \
	; $(ECHO) -n "Input day after last date for report (in format YYYY/MM/DD): " \
	; read END_DATE \
	; $(RLOG) -d$$START_DATE'"<"'$$END_DATE $(CUR_RCS_DIR)/*.xfst,v \
	$(CUR_RCS_DIR)/*.py,v \
	$(CUR_RCS_DIR)/makefile,v \
	$(CUR_RCS_DIR)/TestSuite/* \
	> Report.txt

NahuatlGrammar.tar.gz: $(LEXC_FILES) $(XFST_FILES)
#Just the xfst and lexc files; does not include makefiles.
	$(TAR) czf $@ $^

endif #ifndef NAHUATL_MAKEFILE
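
#For reference, the most commonly useful targets defined above (this summary is
# descriptive only; all the target names come from the rules in this makefile):
#	make Parser.fsm		build the parser network (liberal phonology)
#	make Generator.fsm	build the generator network (strict phonology)
#	make all		build Stems.out, WordAnalyses.out and Debug.out
#	make TestAll		regenerate all of the files in $(TESTSUITE_OUT)
#	make backup.tar.gz	tar up the sources and ship them to the backup sites
#	make DOE.tar.gz		build the DOE deliverable and send it to the LDC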