#Generic language makefile
ifndef LANG_MAKEFILE #prevent infinite recursive inclusions or too many open files
LANG_MAKEFILE = Data/Languages/makefile

#-----------------Read Me--------------------
#This file defines many of the generic language tests, e.g. for comparing the output
# of two parsers in the same language. In order to use it, it should be included by
# a language-particular makefile, which should define the following variables:
#  LANG_CODE: used by OS-specific localization program to choose character set, etc.
#  LANGUAGE: name of the language, usually in mixed case; used in msgs and for
#     language-specific directory names
#  LANGUAGE_UC: name of the language, in all upper case. Used in one case
#     (the Surprise Language dir) for the language-specific subdir.
# The language-specific makefile may also define the ff. vars:
#  LEXICON_FILES: Defines one or more lexicon files, assumed to be in SFM format.
#     They may use either CR/LF or LF newlines.

#-----------------Definitions---------------------
#Get defns from parent makefile:
ifdef THIS_DIR
THIS_DIR:=$(THIS_DIR)/..
else
THIS_DIR:=$(CURDIR)
endif
include $(THIS_DIR)/../makefile

#------------General variables:
LANGUAGES_DIR = /home/maxwell/Data/Languages
#The dir that this makefile is in

#TOKENIZE = $(TR) -d '[:punct:]' | $(AWK) '{for (i=1;i<=NF;i++) printf("%s\nXMLTAG\n", $$i)}'
TOKENIZE = $(TR) -d '[:punct:]' | $(TR) -s '[:space:]' '\n'
# Simple tokenizer. The default is to use 'tr' to remove punctuation
# (punctuation chars are defined in the language-specific makefile) and again to put
# each word on a separate line. (Alternatively, 'awk' can insert XML tags.) Can be
# re-defined in language-specific (or even parser-specific) makefiles.

XTOKENIZE_FSM = Tokenize.fsm
# Slightly more sophisticated tokenizer, based on the one in the Karttunen and Beesley book.
# Run with the Xerox 'tokenize' util (= $(XTOKENIZER), defined in the OS-specific makefiles).
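# For illustration (hypothetical input, not part of the build), the default
# tokenizer pipeline can be tried by hand:
#   echo "The cat sat, the cat slept." | tr -d '[:punct:]' | tr -s '[:space:]' '\n'
# which prints one token per line:
#   The
#   cat
#   sat
#   the
#   cat
#   slept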
#------------Lexicon variables:
#See also SFM variables
LOCAL_LEX = NormalizeLex.db
# Used for the normalized version of the lexicon (see the defn for NORMALIZE_SFMS).
# This should have a dependency in the language-specific makefile on whatever
# source this file is taken from, so it gets updated when necessary.

#------------Parsing variables:
# The following variables should be defined in the language- and parser-particular makefiles
# (although defaults are provided for some); see the individual vars below for further
# info.
# Some have defaults; the rest have the value '<VAR> is undefined', so a later test can pick
# them up. (I tried using the various 'make' functions, but couldn't get both the name and
# the value.)
# Note that all these definitions use ":=", so that they do not override any
# definitions provided above.

DIR1 := DIR1 is undefined
# When doing comparisons, the dir where one of the transducers' makefile is located.
# Defined in the language-specific makefile.

DIR2 := DIR2 is undefined
# When doing comparisons, the dir where the other transducer's makefile is located.
# Defined in the language-specific makefile.

EXTRACT_TEXT := $(CAT)
# Any language/text-specific commands for extracting the language data from the input
# file (e.g. stripping HTML tags). Default is supplied below, can be re-defined in
# the language-specific makefile.

#FULL_CONVERTER1
# When doing comparisons, the transducer which makes the output of transducer #1
# look maximally like that of transducer #2. Defined in the language-specific makefile.

#FULL_CONVERTER2
# When doing comparisons, the transducer which makes the output of transducer #2
# look maximally like that of transducer #1. (May need to be used in conjunction
# with or in place of FULL_CONVERTER1.) Defined in the language-specific makefile.

#INPUT_FIELD
# The number (1-based) of the field in the transducer's output in which the input
# word is echoed. The parse is assumed to be in any (and all) fields following
# this field. Fields are assumed to be tab-separated, although we could add a
# variable (and a parameter for awk) if this is not the case. Defined in the
# language/parser-specific makefile.

#NO_SOLN_STRING
# The transducer's output for failed parses, given as an awk regular expression;
# special chars therefore need to be triply escaped (e.g. \\\?). Defined in the
# language/parser-specific makefile.

#PARSER
# The command for invoking the transducer. Defined in the language/parser-specific
# makefile.

#POS_SFM
# The SFM (without backslashes) which marks part of speech of the lex entry.
# The value should be defined in the language-specific makefile, but it can also
# be overridden (or set) on the command line:
#   export POS_SFM=infv ; make -e ...

PREPROCESS := $(CAT)
# Any transducer-specific commands that must be done prior to passing the contents
# of the file to be parsed on to the transducer (e.g. conversion to lower case,
# or putting each word on a separate line).
# Can be re-defined in the language/parser-specific makefile.

PROJECT = $(PYTHON) $(MYLANGDIR)/Project.py
# Project certain fields into new tab-delimited records

SORTFORMAT := SORTFORMAT is undefined
# Defined below, but it might be necessary to provide an alternative for some parsers.

#STD_CONVERTER1
# When doing comparisons, the transducer which makes the output of transducer #1
# look somewhat like that of transducer #2. Defined in the language-specific makefile.

#STD_CONVERTER2
# When doing comparisons, the transducer which makes the output of transducer #2
# look somewhat like that of transducer #1. Defined in the language-specific makefile.

#TST_DATA
# Location of some language data to parse. Defined in the language-specific makefile.

DEFS = "$(DIR1)" "$(DIR2)" "$(EXTRACT_TEXT)" "$(FULL_CONVERTER1)" "$(FULL_CONVERTER2)" \
	"$(INPUT_FIELD)" "$(LANG_CODE)" "$(NO_SOLN_STRING)" "$(PARSER)" "$(PREPROCESS)" \
	"$(SORTFORMAT)" "$(STD_CONVERTER1)" "$(STD_CONVERTER2)" "$(TOKENIZE)" \
	"$(TST_DATA)"

#-------------Definitions for parsing
NULL_FSM = $(MYLANGDIR)/null.fsm
export SORTFORMAT = $(MYLANGDIR)/SortFormat.awk

#Define common steps in using a parser. Some of this is common to all parsers (or at
# least has a default), some is common to a particular language, and some is parser-specific.
# The ff. parsing steps are shared by many of the recipes below:
PARSE_STEPS = $(CAT) $(TST_DATA) \
	| $(PREPROCESS) \
	| $(EXTRACT_TEXT) \
	| $(TOKENIZE) \
	| $(PARSER)
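# For illustration (hypothetical values, not part of the build): with
# TST_DATA=Tests/sample.txt, PREPROCESS and EXTRACT_TEXT left at their $(CAT)
# defaults, and PARSER set to a lookup command, $(PARSE_STEPS) expands to a
# pipeline equivalent to:
#   cat Tests/sample.txt | cat | cat | tr -d '[:punct:]' | tr -s '[:space:]' '\n' | lookup parser.fsm
# i.e. the test data is tokenized one word per line and fed to the transducer.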
#-------------SFM variables:
#Normalization of SFM files. Normalization consists of ensuring that all fields occupy
# a single line (but they may still be empty), that fields within a record are separated
# by a single newline, and that records are separated by two newlines (i.e. by a blank
# line). In addition, we concatenate multiple files together into one file in the local
# directory. (We can't put them in /tmp, because different lexicons would have the same
# filename.)
# The caller must define the vars IN_REC_SEP and LEXICON_FILES, and the recipe to call
# this (which should have as its prerequisite the user's set of lexicon files).
# That recipe should also put the results into a file $(LOCAL_LEX). We would do that
# here, but the recipe may need to do other things (e.g. normalize inconsistent use
# of SFMs).
# This could be a recipe, but 'make' doesn't seem to understand the use of variables
# that are defined in another makefile as the prerequisites.
# Steps:
#  (1) Cat all the files
#  (2) Turn each tab char, DOS-style carriage return, and space into a single
#      space char.
#  (3) Convert it into a one-record-per-line format...
#  (4) ...then translate it back into a one-field-per-line format.
#      This has the side effect of converting any multi-line fields into
#      single-line fields. (It could be done more efficiently by eliminating
#      any newlines not immediately followed by an SFM, but that is less easy to do
#      using standard Unix tools.)
NORMALIZE_SFMS = \
	REC_SEP=$(IN_REC_SEP) ; \
	$(CAT) $(LEXICON_FILES) \
	| $(TR) -s "\t\r " " " \
	| $(ONE_REC_PER_LINE) \
	| $(ONE_FLD_PER_LINE)
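# For illustration (hypothetical record, not part of the build): a record whose
# gloss field was wrapped across two lines, e.g.
#   \w na
#   \g house,
#   building
# normalizes to one field per line, with a blank line before the next record:
#   \w na
#   \g house, building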
#Define common steps to extract the contents of the SFM fields specified by the value
# of the var FIELDS. Setting this var (and the var LOCAL_LEX) is the responsibility
# of the caller. The SFMs should NOT include the backslash. If there is more than
# one SFM in FIELDS, the list must be quoted.
# Steps:
#  (1) Grep out all the lines containing the desired SFM (with added backslash)
#  (2) Convert any sequences of space chars to a single space char
#      (so we can eliminate any leading spaces in the next step)
#  (3) Select everything past the first space char (i.e. omit the SFM)
#  (4) Trim off any leading and trailing white space.
#  (5) Remove any empty lines
# ASSUMPTION: normalization (NORMALIZE_SFMS) has been done first, so that fields
# occupy a single line.
# We do the above for each field, collecting them into a temp file. When we're done
# we cat out that file.
EXTRACT_FIELDS = \
	$(ECHO) > $(TMP)/FieldData.tmp ; \
	for SFM0 in $$FIELDS ; do \
		SFM="^\\\\$$SFM0 " ; \
		$(CAT) $(LOCAL_LEX) \
		| $(GREP) "$$SFM" \
		| $(TR) -s " " " " \
		| $(CUT) -d" " -f2- -s \
		| $(SED) -e"s/ *$$//" -e"s/^ *//" \
		| $(GREP) -v "^$$" \
		>> $(TMP)/FieldData.tmp ; \
	done ; \
	$(CAT) $(TMP)/FieldData.tmp

#Collect all the words in some field, and tokenize them on white space.
# Useful e.g. for submitting to a spell checker. The caller must define
# the var 'FIELDS', which contains the SFM (without any backslash).
TOKENIZE_FIELD = \
	$(EXTRACT_FIELDS) \
	| $(TR) -s "[:blank:][:punct:]" "\n" \
	| $(SORT) -u \
	> $@
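# For illustration (hypothetical call, not part of the build): to pull out all
# gloss fields marked \g from the normalized lexicon, a recipe can do
#   FIELDS=g ; \
#   $(EXTRACT_FIELDS) \
#   > Glosses.txt
# For a record containing "\g house, building" this emits the line "house, building"
# (the SFM itself and any leading/trailing spaces are stripped).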
#Define common steps in doing counts. We count the number of occurrences of strings
# (defined as lines in the prerequisite file(s)) in the lexicon.
# 'grep -c' will give a count of the number of times a regex appears in a file, and
# 'grep -f' will use the strings in a file as the regexes; but it does not appear
# possible to run grep over an input once and derive the count for _each_ regex.
# Instead, we use a special Python program. The regexes are taken from the lines
# of the prerequisite file, but with $RXPREFIX to the left and $RXSUFFIX to the right.
# (Both default to empty strings.) The lines to be counted are taken from the lexicon
# ($(LOCAL_LEX)).
# NOTE: the counts are _appended_ to whatever exists in the file before.
COUNT_STRINGS = $(PYTHON) $(MYLANGDIR)/CountStrings.py
COUNTS = \
	$(CAT) $(LOCAL_LEX) \
	| $(COUNT_STRINGS) -f $^ -l "$$RXPREFIX" -r "$$RXSUFFIX" \
	>> $@

#Turn a stream of fields encoded in SFMs (each SFM on a separate line, although
# a field may extend beyond a single line) into a stream of records (one
# record per line, with tabs preceding every SFM except the first).
# This uses a Python script to put a single newline before each record-separator SFM,
# and append all other SFMs after the immediately preceding record-separator SFM,
# preceded by a tab char.
# I tried doing this with sed, and it was excruciatingly slow.
# Needs a pipe before, and a "\" after the newline (or a pipe, if the next
# cmd is on the same line).
# Stable, in the sense that if the input is already in a one-record-per-line format,
# nothing will change.
# Assumptions:
#  (1) Input uses Unix-style newlines
#  (2) There are no tab chars in the input.
ONE_REC_PER_LINE = \
	$(PYTHON) $(MYLANGDIR)/OneRecPerLine.py -r $$REC_SEP

#Turn a stream of records encoded in SFMs (one record per line, with tab chars
# preceding all but the first SFM) into a stream of fields, one field per line
# (and no fields extending over more than one line), with two NLs (Unix-style)
# separating each record from the preceding one (and possibly at TOF).
# Steps:
#  (1) Put an (additional) NL after each line, to serve as a record separator.
#      We could do this with 'ped', because 'ped' can directly introduce newlines
#      (which 'sed' cannot). But 'ped' messes up on certain character encodings,
#      so instead we introduce a tab character at each EOLN (we have to use a literal
#      tab, because not all versions of 'sed' understand "\t").
#  (2) Convert all tabs into NLs.
# Assumptions:
#  (1) All tab chars are immediately before an SFM. (We could use sed
#      instead of tr to ensure this, but given that these structures come from
#      the above ONE_REC_PER_LINE process, that should be safe.)
ONE_FLD_PER_LINE = \
	$(SED) -e "s/$$/	/" \
	| $(TR) "\t" "\n"
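# For illustration (hypothetical record with record separator \w, not part of
# the build): ONE_REC_PER_LINE turns
#   \w na
#   \g house
# into the single tab-delimited line
#   \w na<TAB>\g house
# and ONE_FLD_PER_LINE turns that line back into one field per line, with a
# blank line after the record.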
# Modify any fields as required for the reversal.
# This may involve re-naming SFMs. Two cases where this may be necessary are:
#  (1) To match the fields in some already-reversed lexicon
#  (2) To convert sub-senses into ordinary senses, thereby flattening the structure:
#       \sense building
#       \subsense house
#       \subsense outhouse
#      would become
#       \sense building
#       \sense house
#       \sense outhouse
# Any required steps are defined in the language-specific makefile;
# by default, this is just a 'cat':
MOD_FIELDS4REVERSAL = $(CAT)

#Do any required post-processing. By default this is a no-op, but it can be modified
# on a language-particular basis:
POSTPROCESS_LEX = $(CAT)

#Delete any leading sense numbers. If this is to be done, it should be done
# in the language-specific makefile; by default this step is just a 'cat':
DELETE_SENSE_NUMBERS = $(CAT)

#Make a given SFM (specified by a command line parameter) and its field be
# the first SFM in the record. If more than one of the specified SFMs appears
# in the record, the record will be split into multiple records, one for each
# instance of the SFM:
#   \w na
#   \g (1) house (2) building
#   \ex Yax bo'on ta tut na.
# will become
#   \g house
#   \w na
#   \ex Yax bo'on ta tut na.
#
#   \g building
#   \w na
#   \ex Yax bo'on ta tut na.
# If sub-senses exist, they should have been turned into ordinary senses in an earlier stage.
# Caller must define OUT_REC_SEP.
SET_FIRST_SFM = $(PYTHON) $(MYLANGDIR)/SetFirstSFM.py -r $(OUT_REC_SEP) -w Reversal.warn

#Filter out any records whose headword contains a non-ASCII alphabetic char.
RM_NONASCII_HEADWORDS = $(PYTHON) $(MYLANGDIR)/RmNonASCIIHeadWords.py

#Chop the input into chunks, based on the first few letters of the headword.
# Need to specify in the command line the -r flag for the record-initial SFM, and
# the -s flag for the field containing the head word.
CHOPDICT = $(PYTHON) $(MYLANGDIR)/ChopDict.py

#-------------------------Paradigms:

#-----------------Recipes---------------------
%.token: %.txt
#Produce a sorted list of non-numeric tokens from a text file in a simple-minded way.
# The alphabetic chars are lower-cased (using whatever encoding happens to be set).
	$(TR) -cs '[:alpha:]' '\n' < $< \
	| $(TR) '[:upper:]' '[:lower:]' \
	| $(SORT) -u \
	| $(GREP) -vE '^[[:digit:]]+$$' \
	> $@

SFMPatterns.txt: $(LOCAL_LEX)
#Create a list of unique SFM patterns within records.
	REC_SEP=$(IN_REC_SEP) ; \
	$(CAT) $< \
	| $(ONE_FLD_PER_LINE) \
	| $(CUT) -d" " -f1 \
	| $(ONE_REC_PER_LINE) \
	| $(SORT) -u \
	> $@

%.VerifySFMs: % $(RX_FILE)
#Test the conformance of the SFM file % to the regular expr describing the licit record
# structure, as given in the file %.rx. The result is output to the file %.VerifySFMs.
# This is a copy of the original file, but with error msgs inserted; it can be read into
# Shoebox and edited in place of the original SFM file.
	$(BANNER)
	$(VERIFY_SFMS) -r $(RX_FILE) -s $< \
	> $@

$(XTOKENIZE_FSM): $(MYSRCDIR)/xfst/Tokenize.xfst
#Beesley and Karttunen tokenizer (see definition for XTOKENIZE_FSM, above).
	cd $(dir $<); \
	$(XFST) -e "source $(notdir $<)" -stop
# Do NOT continue above line onto the following, so the resulting file gets copied to the correct dir:
	cp $(dir $<)/$@ .
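# For illustration (hypothetical file name, not part of the build): given a text
# file Genesis.txt, the pattern rule above produces a sorted, lower-cased token
# list with
#   make Genesis.token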
#--------------------Morphological parsing stuff--------------------
#-------------Comparisons between parsers:
# 'cfraw', 'cfstd', and 'cffull' are intended to be called by the user; '_internal_compare' is not.
# These recipes assume the following definitions:
#  NULL_FSM: Don't do any conversion of tags; defined above.
#  STD_CONVERTER{1,2}: A "standard" converter to be tested (to convert from parser-specific tags
#     to standardized tags); defined in the calling makefile, normally the language-specific makefile.
#  FULL_CONVERTER{1,2}: A "full" converter to be tested (to convert parser-specific tag notations
#     to a more standardized notation, doing more standardization than STD_CONVERTER); defined in
#     the calling makefile, normally the language-specific makefile.
#  DIR{1,2}: The subdirs (relative to the calling makefile, usually the language-specific dir)
#     where the parser-specific converters are located. Needed so we can cd to that directory and
#     use its makefile. Also, we use the dir name as a filename for the output file, so that it
#     appears as a mnemonic in tkdiff's labels for the two files.

cfraw:
#Run '_internal_compare' (below) without any conversion of parser tags to a common format.
	@$(MAKE) -sC .. -f makefile $(NULL_FSM)
	@$(MAKE) -s _internal_compare CONVERTER1=$(NULL_FSM) CONVERTER2=$(NULL_FSM)

cfstd:
#Run '_internal_compare' doing conversion of output formats to a standard format.
# Have to make Xelda2Common.fsm and LDC2Common.fsm; can't simply list these as dependencies
# here, because their dependencies are defined in the subdir's makefile.
	@$(MAKE) -sC $(DIR1) -f makefile $(STD_CONVERTER1)
	@$(MAKE) -sC $(DIR2) -f makefile $(STD_CONVERTER2)
	@$(MAKE) -s _internal_compare CONVERTER1=$(STD_CONVERTER1) CONVERTER2=$(STD_CONVERTER2)

cffull:
#Run '_internal_compare' converting tags to common tags (e.g. 'mf' to 'masc' and 'fem').
# See comment under 'cfstd' recipe re the following calls to 'make'.
	@$(MAKE) -sC $(DIR1) -f makefile $(FULL_CONVERTER1)
	@$(MAKE) -sC $(DIR2) -f makefile $(FULL_CONVERTER2)
	@$(MAKE) -s _internal_compare CONVERTER1=$(FULL_CONVERTER1) CONVERTER2=$(FULL_CONVERTER2)

_internal_compare:
#Run two transducers on the same data, and diff the results.
# Generally called from 'cfraw' etc., which set the comparison levels.
# The default is to do no conversion (equivalent to 'cfraw').
# CONVERTER1 and CONVERTER2 are defined in the recipe that calls this recipe
# (i.e. cfraw, cfstd, or cffull).
# DIR1 and DIR2 are defined in another makefile (see above).
#Sanity tests:
#	@if [ ! $(TST_LANGUAGE)'' ] ; then $(ERROR) "TST_LANGUAGE must be defined" > /dev/tty ; exit 1 ; fi
# Above line is taken care of by make env var 'MAKEFLAGS=--warn-undefined-variables'
#	@if [ ! -d ../$(TST_LANGUAGE) ] ; then $(ERROR) "'"$(TST_LANGUAGE)"'" "must be a subdir of "../$(CURDIR) > /dev/tty ; exit 1 ; fi
# What we really want is to ensure that the current dir ends in TST_LANGUAGE, but that's rather hard...
	@if [ ! $(CONVERTER1)'' ]; then $(ERROR) "CONVERTER1 must be defined" > /dev/tty ; exit 1 ; fi
#This can happen if the user called this directly, without defining the CONVERTER1 var.
#The var is defined by the recipes 'cfraw', 'cfstd', and 'cffull'.
	@if [ ! $(CONVERTER2)'' ] ; then $(ERROR) "CONVERTER2 must be defined" > /dev/tty ; exit 1 ; fi
#See comments for previous cmd.
#Run the tests:
	$(LOCALIZE); cd $(DIR1); \
	$(MAKE) --no-print-directory -s SortFormat \
	| $(EXPAND) \
	| $(LOOKUP) $(CONVERTER1) -flags xmbTT \
	| $(GREP) -v "^ *$$" \
	| $(SORT) -u \
	1> $(TMP)/$(DIR1).out
# "-s" flag on make means silent (so we don't get comments from 'make' which result in extraneous diffs).
# Likewise the "--no-print-directory" flag (otherwise the directory changes show up in the output; can't
# seem to change this by only piping stdout).
# 'expand' converts tabs to spaces, since LOOKUP ignores anything to the right of a tab char.
# The call to 'lookup' converts the parser's output to a std format.
# Flags on 'lookup':
#  'x': don't copy input to output
#  'mb': assume multi-char symbols on both upper and lower sides (namely, the tags)
#  'TT': don't insert anything between "lemma" and "tags" (otherwise we get a tab separator
#     before the first thing it thinks is a tag, even splitting "comment" into "com ment")
# 'grep' removes blank lines introduced by 'lookup'
# (the "$" for "EOLN" needs to be doubled so 'make' won't interpret it as a var).
# 'sort' puts the canonical parses in a canonical order (note that the first column is already
# sorted by the SortFormat, but the second column may have become unsorted by virtue of being
# converted to canonical form).
# "1>" means redirect only stdout, not error msgs.
	$(LOCALIZE); cd $(DIR2); \
	$(MAKE) --no-print-directory -s SortFormat \
	| $(EXPAND) \
	| $(LOOKUP) $(CONVERTER2) -flags xmbTT \
	| $(GREP) -v "^ *$$" \
	| $(SORT) -u \
	1> $(TMP)/$(DIR2).out
# See above for various command line flags, etc.
# Finally, compare results:
	-$(VISUALDIFF) -iw $(TMP)/$(DIR1).out $(TMP)/$(DIR2).out &
# The leading "-" on this cmd allows us to ignore (sort of) the exit status of 'diff',
# which will be 1 if there were differences. The "-i" flag means ignore case diffs
# (the LDC transducer wants lower-cased words, so we lower-case words going into it,
# while xfst doesn't care, so we don't bother). The "w" flag means ignore any whitespace diffs.
# Since vdiff is a separate app, we launch it with an "&", to allow continued use of the terminal.
# If vanilla 'diff' is used instead of a visual diff, pipe its output into something like
#   $(SED) "s/^< /X: /; s/^> /L: /;"
# to convert the ">" and "<" labels in the output of diff into labels (e.g. X for xfst, L for LDC).
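# For illustration (hypothetical language dir, not part of the build): from a
# language-specific directory whose makefile defines DIR1, DIR2, and the
# converters, the three comparison levels are run as
#   make cfraw     # diff the raw parser outputs
#   make cfstd     # diff after standardizing the tag names
#   make cffull    # diff after full tag conversion (e.g. 'mf' -> 'masc'/'fem')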
#-------------Parse free text
XMLFormat: Definitions
#Dependencies:
#  (1) Definitions: checks that the vars which must be defined are indeed defined
#  (2) $(PARSER): builds the parser, if necessary.
#      THIS HAS BEEN COMMENTED OUT because it causes a 'multiple target patterns'
#      error in 'make' (probably because $(PARSER) may include e.g. 'lookup' and
#      cmd-line parameters).
#Steps:
#  (1) Preprocessing, e.g. for the LDC Spanish xducer, we convert upper case to lower.
#      If nothing is needed, 'cat' is essentially a no-op.
#  (2) Get rid of the first three fields (offsets into audio, and speaker identification)
#      and bracket each word with 'XMLTAG' on a separate line. (We will later turn 'XMLTAG'
#      into real XML tags; if we tried to insert the XML tags at this point, 'tr' would
#      strip the '<' and '>'.)
#  (3) Get rid of punctuation. Ideally, we would do this with a command like
#        $(TR) -d "[:punct:]"
#      However, I can't get 'localize' to work correctly, so for the time being, I'm defining
#      the punctuation characters individually for each language using a var PUNCT.
#  (4) Parse. The transducer sends the line numbers to stderr, which we redirect to the bit-bucket.
#      (They cannot be relied on to stay in sync on the screen, and redirecting both to a file
#      seems to be problematic.)
#  (5) Convert 'XMLTAG' into real XML tags. (At present, we just put <word> tags around the words,
#      but there's room for further development, if we design a standard XML format for interlinear
#      text...)
# The output is in a very simplified XML format.
	$(PARSE_STEPS) \
	| $(AWK) '{if (/XMLTAG/) {printf("<\\word>\n<word>\n")} else print}' \
	| $(MORE)

ArabFormat: SanityTests
#Output in "Arabic" format, so it can be used by Hubert's manual disambiguator tool.
# The output format is as follows (see the file ArabFormat.txt for a larger sample):
#
#   INPUT STRING: لبنان
#   LOOK-UP WORD: lbnAn
#   Comment:
#   * SOLUTION 1: (lubonAn) lubonAn/NOUN_PROP
#     (GLOSS): + Lebanon +
#     SOLUTION 2: (libanAn) li/PREP+banAn/NOUN
#     (GLOSS): for/to + finger tips +
#     SOLUTION 3: (labanAn) la/EMPHATIC_PARTICLE+banAn/NOUN
#     (GLOSS): indeed/truly + finger tips +
#
# If the parser doesn't succeed, there are no "solutions". The '*' is added by
# human annotators for the "correct" parse, and is not created by the following.
	$(PARSE_STEPS) \
	| $(AWK) -f $(ARABFORMAT) -v InputField=$(INPUT_FIELD) -v NoSolutionString=$(NO_SOLN_STRING)

Definitions:
#Check that the vars which must be defined are indeed defined.
# The outer grep determines whether any vars are undefined (and causes an eventual exit
# from 'make'), while the inner loop (executed only if the outer grep detects a problem)
# tells exactly which definitions are missing.
	@if $(ECHO) $(DEFS) | $(GREP) undefined >> /dev/null ; then \
		for D in $(DEFS) ; do \
			if $(ECHO) $$D | $(GREP) undefined >> /dev/null ; then \
				echo $$D ; \
			fi ; \
		done ; \
		exit 1 ; \
	fi

SortFormat: SanityTests
#Output in sorted format, to make for easy comparison between parsers.
# The output has all lines sorted together (and no duplicate lines -- the primary reason
# for sorting at this point is to eliminate dupes, to speed things up later;
# after we put the parses into canonical form, we'll do another sort).
# The output format is as follows:
#   <InputWord>\t<A Parse>
# Note that if a word parses ambiguously, the individual parses are on separate lines.
	$(GREEN); $(ECHO) PARSER=$(PARSER) > /dev/tty ; $(BLACK)
	$(PARSE_STEPS) \
	| $(GREP) -v "XMLTAG" \
	| $(AWK) -f $(SORTFORMAT) -v InputField=$(INPUT_FIELD) -v NoSolutionString=$(NO_SOLN_STRING) \
	| $(SORT) -u

SanityTests: Definitions
#Make sure the necessary variables have been defined to something that makes sense.
# (The prerequisite of 'Definitions' ensures that all the necessary vars have values;
# here we just ensure that some of those values make sense.)
# The redirection of the error msgs to /dev/tty is necessary in case output has been otherwise redirected.
#	@if [ ! -f $(TST_DATA) ] ; then $(ERROR) "Cannot find test data file" $(TST_DATA) > /dev/tty ; exit 1 ; fi
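# For illustration (hypothetical French word and tags, not part of the build):
# SortFormat output for an ambiguously parsing word looks like
#   portes	porte+Noun+Fem+Pl
#   portes	porter+Verb+PresInd+2P+Sg
# one <InputWord>\t<Parse> pair per line, sorted and de-duplicated.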
#-------------------Dictionary recipes----------------------------------
#Based on SFM dictionaries. Could instead be based on XML format.

SFMs: $(LOCAL_LEX)
#Collect all the unique SFMs. Although we tell 'cut' to cut at a space char, it
# seems to leave on some extraneous space chars. So we use 'tr' to remove them.
	$(BANNER)
	$(CAT) $(LOCAL_LEX) \
	| $(GREP) "^\\\." \
	| $(TR) -s " " " " \
	| $(CUT) -d" " -f1 \
	| $(SORT) -u \
	> SFMs

SFMCounts: SFMs
#Construct a list of unique SFMs, each followed by a (tab-separated) count.
# The list is sorted from most common to least common.
# The rx prefix does not need the backslash char, because it's already in the SFMs.
	$(BANNER)
	$(ECHO) > $@ #Wipe any previous results
	RXPREFIX="^" ; \
	RXSUFFIX=" " ; \
	$(COUNTS)

POSs: $(LOCAL_LEX)
#Construct a list of unique POSs. Ordinarily, this gives (and the following recipe counts)
# whatever field POS_SFM is set to in the language-particular makefile. However, this can
# be overridden on the command line as follows:
#   export POS_SFM=infv ; make -e POSs
	$(BANNER)
#Ensure POS_SFM is set:
	if test "$(POS_SFM)" = '' ; then \
		$(ERROR) "Variable POS_SFM must be set" ; \
		exit 1 ; \
	fi
	FIELDS=$(POS_SFM) ; \
	$(EXTRACT_FIELDS) \
	| $(SORT) -u \
	| $(GREP) -vE "^[[:space:]]*$$" \
	> POSs

POSCounts: POSs
#Construct a list of unique POSs, each followed by a (tab-separated) count.
# The list is sorted from most common to least common.
	$(BANNER)
#Ensure POS_SFM is set:
	if test "$(POS_SFM)" = '' ; then \
		$(ERROR) "Variable POS_SFM must be set" ; \
		exit 1 ; \
	fi
	$(ECHO) > $@ #Wipe any previous results
	RXPREFIX="\\$(POS_SFM) " ; \
	RXSUFFIX="$$" ; \
	$(COUNTS)

Reversal.db: $(LOCAL_LEX)
#Reverse the lexicon.
# Input format:
#   Record separator = \id
#   Yoruba word      = \w
#   English word     = \d
# Output format:
#   Record separator = \ENGL
#   Yoruba word      = \YORU
#   English word     = \ENGL
# Steps:
#  (1) Modify any fields as required for the reversal.
#      This may involve re-naming SFMs (perhaps to match the fields in some already-
#      reversed lexicon). The steps for this are defined in the language-specific
#      makefile, as the var MOD_FIELDS4REVERSAL. By default, this is just a 'cat'.
#  (2) Split multiple senses. The steps for this are defined in the language-specific
#      makefile, as the var DELETE_SENSE_NUMBERS (q.v.). By default, this is just a 'cat'.
#  (3) Concatenate each record onto a single line, with a tab char before every SFM
#      except the first (using ONE_REC_PER_LINE).
#  (4) Reversal: Put the sense (generally an English gloss) in first position on each line,
#      turning single records into multiple ones where there was more than one gloss field
#      (e.g. where step (2) split multiple senses into multiple gloss fields).
#  (5) Sort it (by the English now), folding case (so upper and lower case words sort
#      adjacent). We do NOT sort uniquely; this would have the side effect of getting
#      rid of duplicate records, if there are any, but it may be better not to do that
#      (as a QC measure--although at present I don't have any such QC).
#  (6) Perform any post-processing. By default, POSTPROCESS_LEX is a no-op, but it can
#      be re-defined on a language-particular basis.
#  (7) Un-concatenate fields, putting one newline between fields of a single
#      record, and two newlines between records.
	$(BANNER)
	REC_SEP=$(IN_REC_SEP) ; \
	$(CAT) $< \
	| $(MOD_FIELDS4REVERSAL) \
	| $(DELETE_SENSE_NUMBERS) \
	| $(ONE_REC_PER_LINE) \
	| $(SET_FIRST_SFM) \
	| $(SORT) -f \
	| $(POSTPROCESS_LEX) \
	| $(ONE_FLD_PER_LINE) \
	> $@
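# For illustration (hypothetical record, not part of the build): after renaming
# fields and reversing, a record such as
#   \w ile
#   \d house
# is emitted with the English gloss as the new first field:
#   \ENGL house
#   \YORU ile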
FrenchWords.txt: $(LOCAL_LEX)
#Extract the words from all the French fields, tokenize them, and sort them uniquely
# (for spell checking). We currently take glosses from only a single field, but
# it might be desirable to loosen this restriction so we could include e.g.
# translations of example sentences.
#	FIELDS=$(FRENCH_GLOSS_FIELD) ;
	FIELDS=$(FRENCH_SFMs) ; \
	$(EXTRACT_FIELDS) \
	| $(TR) -s "[:blank:][:punct:]" "\n" \
	| $(SORT) -u \
	> FrenchWords.txt

FrenchMisspellings.txt: FrenchWords.txt
#Run a French spell checker over the French words.
# WARNING: 'aspell' may not be installed on every system.
	$(BANNER)
	$(SPELL) --lang=fr -l < $^ \
	> $@

EnglishWords.txt: $(LOCAL_LEX)
#Extract all the words from the English fields, tokenize them, and sort them uniquely
# (for spell checking). See additional comments above re French words.
	FIELDS=$(ENGLISH_GLOSS_FIELD) ; \
	$(EXTRACT_FIELDS) \
	| $(TR) -s "[:blank:][:punct:]" "\n" \
	| $(SORT) -u \
	> EnglishWords.txt

EnglishMisspellings.txt: EnglishWords.txt
#Run an English spell checker over the English words.
# WARNING: 'aspell' may not be installed on every system.
	$(BANNER)
	$(SPELL) --lang=en -l < $^ \
	> $@

MissingXrefs.txt: $(LOCAL_LEX)
#Find any cross-references that don't exist.
# Requires two definitions:
#  XREF_ER_SFMS: List of SFMs used for xrefs (without backslashes)
#     NB: If there is more than one, they must be quoted
#  XREF_EE_SFM: SFM for the field to which xrefs refer (without the backslash)
# We assume all xrefs point to the same SFM (probably the citation form).
	$(BANNER)
#First get a list of all referees (things to which an xref can refer):
	FIELDS=$(XREF_EE_SFM) ; \
	$(EXTRACT_FIELDS) \
	| $(SORT) -u \
	> $(TMP)/xreferees.txt
#Then get a list of all referers:
	FIELDS=$(XREF_ER_SFMS) ; \
	$(EXTRACT_FIELDS) \
	| $(SORT) -u \
	> $(TMP)/xreferers.txt
	$(DIFF) $(TMP)/xreferees.txt $(TMP)/xreferers.txt \
	| $(GREP) "> " \
	| $(SED) -e"s/^> //" \
	> $@

ReciprocalXrefs.txt: $(LOCAL_LEX)
#For each cross-ref which exists and which is supposed to be reciprocal,
# ensure that it exists in both directions. Requires two definitions:
#  XREF_RECIP_SFM: SFM used for reciprocal xrefs (without the backslash)
#  XREF_EE_SFM: SFM for the field to which xrefs refer (without the backslash)
# We assume all xrefs point to the same SFM (probably the citation form).
# Algorithm: extract all the xreferer-->xreferee pairs into two sorted tab-delimited lists,
# one as Referer\tReferee, the other as Referee\tReferer. Do a diff between the two lists;
# any items in the second list but not in the first need to be added to the dictionary.
# Create a file with the order REFERER - REFEREE:
	REC_SEP=$(IN_REC_SEP) ; \
	$(CAT) $(LOCAL_LEX) \
	| $(ONE_REC_PER_LINE) \
	| $(PROJECT) -r $(XREF_RECIP_SFM) -o $(XREF_EE_SFM) \
	| $(SORT) \
	> $(TMP)/XRefs.sfm
# Create a file with the reverse order (REFEREE - REFERER), but with the SFMs swapped.
# First, the original XREF_RECIP_SFM fields:
	$(CUT) -f1 < $(TMP)/XRefs.sfm \
	| $(SED) -e"s/^\\\\$(XREF_RECIP_SFM)/\\\\$(XREF_EE_SFM)/" \
	> $(TMP)/XRefs1.sfm
	$(CUT) -f2 < $(TMP)/XRefs.sfm \
	| $(SED) -e"s/^\\\\$(XREF_EE_SFM)/\\\\$(XREF_RECIP_SFM)/" \
	> $(TMP)/XRefs2.sfm
	$(PASTE) $(TMP)/XRefs2.sfm $(TMP)/XRefs1.sfm \
	| $(SORT) \
	> $(TMP)/RevXRefs.sfm
# Ff. cmd must be preceded by a '-', because 'diff' sets the exit status to non-zero
# if there are any diffs:
	-$(DIFF) $(TMP)/XRefs.sfm $(TMP)/RevXRefs.sfm \
	| $(GREP) "^> " \
	| $(SED) -e"s/^> //" \
	> $(TMP)/MissingXRefs.sfm
# ...But we want to reverse the order of fields in the latter file, so:
	$(CUT) -f1 < $(TMP)/MissingXRefs.sfm > $(TMP)/MissingXRefs1.sfm
	$(CUT) -f2 < $(TMP)/MissingXRefs.sfm > $(TMP)/MissingXRefs2.sfm
	$(PASTE) $(TMP)/MissingXRefs2.sfm $(TMP)/MissingXRefs1.sfm > $@

ValidateXMLLex: $(XML_DICT) $(XSD_FILE)
#Run the XML version of the dictionaries through XML validation:
	$(XMLLINT) --noout --schema $(XSD_FILE) $(XML_DICT)

install:
#Copy some results over to a user's directory.
#	-rm $(INSTALL_DIR)/*    DANGEROUS!
	cp $(OUTPUT_FILES) $(INSTALL_DIR)

endif #ifndef LANG_MAKEFILE