#Generic language makefile
ifndef LANG_MAKEFILE #prevent infinite recursive inclusions or too many open files
LANG_MAKEFILE = Data/Languages/makefile

#-----------------Read Me--------------------
#This file defines many of the generic language tests, e.g. for comparing the output
# of two parsers in the same language. In order to use it, it should be included by
# a language-particular makefile, which should define the following variables:
#  LANG_CODE: used by OS-specific localization program to choose character set, etc.
#  LANGUAGE: name of the language, usually in mixed case; used in msgs and for
#     language-specific directory names
#  LANGUAGE_UC: name of the language, in all upper case. Used in one case
#     (the Surprise Language dir) for the language-specific subdir.
# The language-specific makefile may also define the ff. vars:
#  LEXICON_FILES: Defines one or more lexicon files, assumed to be in SFM format.
#     They may use either CR/LF or LF newlines.

#-----------------Definitions---------------------
#Get defns from parent makefile:
ifdef THIS_DIR
THIS_DIR:=$(THIS_DIR)/..
else
THIS_DIR:=$(CURDIR)
endif
include $(THIS_DIR)/../makefile

#------------General variables:
LANGUAGES_DIR = /home/maxwell/Data/Languages
#The dir that this makefile is in

#TOKENIZE = $(TR) -d '[:punct:]' | $(AWK) '{for (i=1;i<=NF;i++) printf("%s\nXMLTAG\n", $$i)}'
TOKENIZE = $(TR) -d '[:punct:]' | $(TR) -s '[:space:]' '\n'
# Simple tokenizer. The default is to use 'tr' to remove punctuation
# (punctuation chars are defined in the language-specific makefile) and again to put
# each word on a separate line. (Alternatively, 'awk' can insert XML tags.) Can be
# re-defined in language-specific (or even parser-specific) makefiles.

XTOKENIZE_FSM = Tokenize.fsm
# Slightly more sophisticated tokenizer, based on the one in the Karttunen and Beesley book.
# Run with the Xerox 'tokenize' util (= $(XTOKENIZER), defined in the OS-specific makefiles).
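# For illustration (hypothetical input, not part of the build), the default
# tokenizer pipeline can be tried by hand:
#   echo "The cat sat, the cat slept." | tr -d '[:punct:]' | tr -s '[:space:]' '\n'
# which prints one token per line:
#   The
#   cat
#   sat
#   the
#   cat
#   slept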
#------------Lexicon variables:
#See also SFM variables
LOCAL_LEX = NormalizeLex.db
# Used for the normalized version of the lexicon (see the defn for NORMALIZE_SFMS).
# This should have a dependency in the language-specific makefile on whatever
# source this file is taken from, so it gets updated when necessary.

#------------Parsing variables:
# The following variables should be defined in the language- and parser-particular makefiles
# (although defaults are provided for some); see the individual vars below for further
# info.
# Some have defaults; the rest have the value '<VAR> is undefined', so a later test can pick
# them up. (I tried using the various 'make' functions, but couldn't get both the name and
# the value.)
# Note that all these definitions use ":=", so that they do not override any
# definitions provided above.

DIR1 := DIR1 is undefined
# When doing comparisons, the dir where one of the transducers' makefile is located.
# Defined in the language-specific makefile.

DIR2 := DIR2 is undefined
# When doing comparisons, the dir where the other transducer's makefile is located.
# Defined in the language-specific makefile.

EXTRACT_TEXT := $(CAT)
# Any language/text-specific commands for extracting the language data from the input
# file (e.g. stripping HTML tags). Default is supplied below, can be re-defined in
# the language-specific makefile.

#FULL_CONVERTER1
# When doing comparisons, the transducer which makes the output of transducer #1
# look maximally like that of transducer #2. Defined in the language-specific makefile.

#FULL_CONVERTER2
# When doing comparisons, the transducer which makes the output of transducer #2
# look maximally like that of transducer #1. (May need to be used in conjunction
# with or in place of FULL_CONVERTER1.) Defined in the language-specific makefile.

#INPUT_FIELD
# The number (1-based) of the field in the transducer's output in which the input
# word is echoed. The parse is assumed to be in any (and all) fields following
# this field. Fields are assumed to be tab-separated, although we could add a
# variable (and a parameter for awk) if this is not the case. Defined in the
# language/parser-specific makefile.

#NO_SOLN_STRING
# The transducer's output for failed parses, given as an awk regular expression;
# special chars therefore need to be triply escaped (e.g. \\\?). Defined in the
# language/parser-specific makefile.

#PARSER
# The command for invoking the transducer. Defined in the language/parser-specific
# makefile.

#POS_SFM
# The SFM (without backslashes) which marks part of speech of the lex entry.
# The value should be defined in the language-specific makefile, but it can also
# be overridden (or set) on the command line:
#   export POS_SFM=infv ; make -e ...

PREPROCESS := $(CAT)
# Any transducer-specific commands that must be done prior to passing the contents
# of the file to be parsed on to the transducer (e.g. conversion to lower case,
# or putting each word on a separate line).
# Can be re-defined in the language/parser-specific makefile.

PROJECT = $(PYTHON) $(MYLANGDIR)/Project.py
# Project certain fields into new tab-delimited records

SORTFORMAT := SORTFORMAT is undefined
# Defined below, but it might be necessary to provide an alternative for some parsers.

#STD_CONVERTER1
# When doing comparisons, the transducer which makes the output of transducer #1
# look somewhat like that of transducer #2. Defined in the language-specific makefile.

#STD_CONVERTER2
# When doing comparisons, the transducer which makes the output of transducer #2
# look somewhat like that of transducer #1. Defined in the language-specific makefile.

#TST_DATA
# Location of some language data to parse. Defined in the language-specific makefile.

DEFS = "$(DIR1)" "$(DIR2)" "$(EXTRACT_TEXT)" "$(FULL_CONVERTER1)" "$(FULL_CONVERTER2)" \
	"$(INPUT_FIELD)" "$(LANG_CODE)" "$(NO_SOLN_STRING)" "$(PARSER)" "$(PREPROCESS)" \
	"$(SORTFORMAT)" "$(STD_CONVERTER1)" "$(STD_CONVERTER2)" "$(TOKENIZE)" \
	"$(TST_DATA)"

#-------------Definitions for parsing
NULL_FSM = $(MYLANGDIR)/null.fsm
export SORTFORMAT = $(MYLANGDIR)/SortFormat.awk

#Define common steps in using a parser. Some of this is common to all parsers (or at
# least has a default), some is common to a particular language, and some is parser-specific.
# The ff. parsing steps are shared by many of the recipes below:
PARSE_STEPS = $(CAT) $(TST_DATA) \
	| $(PREPROCESS) \
	| $(EXTRACT_TEXT) \
	| $(TOKENIZE) \
	| $(PARSER)
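# For illustration (hypothetical values, not part of the build): with
# TST_DATA=Tests/sample.txt, PREPROCESS and EXTRACT_TEXT left at their $(CAT)
# defaults, and PARSER set to a lookup command, $(PARSE_STEPS) expands to a
# pipeline equivalent to:
#   cat Tests/sample.txt | cat | cat | tr -d '[:punct:]' | tr -s '[:space:]' '\n' | lookup parser.fsm
# i.e. the test data is tokenized one word per line and fed to the transducer.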
#-------------SFM variables:
#Normalization of SFM files. Normalization consists of ensuring that all fields occupy
# a single line (but they may still be empty), that fields within a record are separated
# by a single newline, and that records are separated by two newlines (i.e. by a blank
# line). In addition, we concatenate multiple files together into one file in the local
# directory. (We can't put them in /tmp, because different lexicons would have the same
# filename.)
# The caller must define the vars IN_REC_SEP and LEXICON_FILES, and the recipe to call
# this (which should have as its prerequisite the user's set of lexicon files).
# That recipe should also put the results into a file $(LOCAL_LEX). We would do that
# here, but the recipe may need to do other things (e.g. normalize inconsistent use
# of SFMs).
# This could be a recipe, but 'make' doesn't seem to understand the use of variables
# that are defined in another makefile as the prerequisites.
# Steps:
#  (1) Cat all the files
#  (2) Turn each tab char, DOS-style carriage return, and space into a single
#      space char.
#  (3) Convert it into a one-record-per-line format...
#  (4) ...then translate it back into a one-field-per-line format.
#      This has the side effect of converting any multi-line fields into
#      single-line fields. (It could be done more efficiently by eliminating
#      any newlines not immediately followed by an SFM, but that is less easy to do
#      using standard Unix tools.)
NORMALIZE_SFMS = \
	REC_SEP=$(IN_REC_SEP) ; \
	$(CAT) $(LEXICON_FILES) \
	| $(TR) -s "\t\r " " " \
	| $(ONE_REC_PER_LINE) \
	| $(ONE_FLD_PER_LINE)
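# For illustration (hypothetical record, not part of the build): a record whose
# gloss field was wrapped across two lines, e.g.
#   \w na
#   \g house,
#   building
# normalizes to one field per line, with a blank line before the next record:
#   \w na
#   \g house, building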
#Define common steps to extract the contents of the SFM fields specified by the value
# of the var FIELDS. Setting this var (and the var LOCAL_LEX) is the responsibility
# of the caller. The SFMs should NOT include the backslash. If there is more than
# one SFM in FIELDS, the list must be quoted.
# Steps:
#  (1) Grep out all the lines containing the desired SFM (with added backslash)
#  (2) Convert any sequences of space chars to a single space char
#      (so we can eliminate any leading spaces in the next step)
#  (3) Select everything past the first space char (i.e. omit the SFM)
#  (4) Trim off any leading and trailing white space.
#  (5) Remove any empty lines
# ASSUMPTION: normalization (NORMALIZE_SFMS) has been done first, so that fields
# occupy a single line.
# We do the above for each field, collecting them into a temp file. When we're done
# we cat out that file.
EXTRACT_FIELDS = \
	$(ECHO) > $(TMP)/FieldData.tmp ; \
	for SFM0 in $$FIELDS ; do \
		SFM="^\\\\$$SFM0 " ; \
		$(CAT) $(LOCAL_LEX) \
		| $(GREP) "$$SFM" \
		| $(TR) -s " " " " \
		| $(CUT) -d" " -f2- -s \
		| $(SED) -e"s/ *$$//" -e"s/^ *//" \
		| $(GREP) -v "^$$" \
		>> $(TMP)/FieldData.tmp ; \
	done ; \
	$(CAT) $(TMP)/FieldData.tmp

#Collect all the words in some field, and tokenize them on white space.
# Useful e.g. for submitting to a spell checker. The caller must define
# the var 'FIELDS', which contains the SFM (without any backslash).
TOKENIZE_FIELD = \
	$(EXTRACT_FIELDS) \
	| $(TR) -s "[:blank:][:punct:]" "\n" \
	| $(SORT) -u \
	> $@
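# For illustration (hypothetical call, not part of the build): to pull out all
# gloss fields marked \g from the normalized lexicon, a recipe can do
#   FIELDS=g ; \
#   $(EXTRACT_FIELDS) \
#   > Glosses.txt
# For a record containing "\g house, building" this emits the line "house, building"
# (the SFM itself and any leading/trailing spaces are stripped).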
#Define common steps in doing counts. We count the number of occurrences of strings
# (defined as lines in the prerequisite file(s)) in the lexicon.
# 'grep -c' will give a count of the number of times a regex appears in a file, and
# 'grep -f' will use the strings in a file as the regexes; but it does not appear
# possible to run grep over an input once and derive the count for _each_ regex.
# Instead, we use a special Python program. The regexes are taken from the lines
# of the prerequisite file, but with $RXPREFIX to the left and $RXSUFFIX to the right.
# (Both default to empty strings.) The lines to be counted are taken from the lexicon
# ($(LOCAL_LEX)).
# NOTE: the counts are _appended_ to whatever exists in the file before.
COUNT_STRINGS = $(PYTHON) $(MYLANGDIR)/CountStrings.py
COUNTS = \
	$(CAT) $(LOCAL_LEX) \
	| $(COUNT_STRINGS) -f $^ -l "$$RXPREFIX" -r "$$RXSUFFIX" \
	>> $@

#Turn a stream of fields encoded in SFMs (each SFM on a separate line, although
# a field may extend beyond a single line) into a stream of records (one
# record per line, with tabs preceding every SFM except the first).
# This uses a Python script to put a single newline before each record-separator SFM,
# and append all other SFMs after the immediately preceding record-separator SFM,
# preceded by a tab char.
# I tried doing this with sed, and it was excruciatingly slow.
# Needs a pipe before, and a "\" after the newline (or a pipe, if the next
# cmd is on the same line).
# Stable, in the sense that if the input is already in a one-record-per-line format,
# nothing will change.
# Assumptions:
#  (1) Input uses Unix-style newlines
#  (2) There are no tab chars in the input.
ONE_REC_PER_LINE = \
	$(PYTHON) $(MYLANGDIR)/OneRecPerLine.py -r $$REC_SEP

#Turn a stream of records encoded in SFMs (one record per line, with tab chars
# preceding all but the first SFM) into a stream of fields, one field per line
# (and no fields extending over more than one line), with two NLs (Unix-style)
# separating each record from the preceding one (and possibly at TOF).
# Steps:
#  (1) Put an (additional) NL after each line, to serve as a record separator.
#      We could do this with 'ped', because 'ped' can directly introduce newlines
#      (which 'sed' cannot). But 'ped' messes up on certain character encodings,
#      so instead we introduce a tab character at each EOLN (we have to use a literal
#      tab, because not all versions of 'sed' understand "\t").
#  (2) Convert all tabs into NLs.
# Assumptions:
#  (1) All tab chars are immediately before an SFM. (We could use sed
#      instead of tr to ensure this, but given that these structures come from
#      the above ONE_REC_PER_LINE process, that should be safe.)
ONE_FLD_PER_LINE = \
	$(SED) -e "s/$$/	/" \
	| $(TR) "\t" "\n"
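# For illustration (hypothetical record with record separator \w, not part of
# the build): ONE_REC_PER_LINE turns
#   \w na
#   \g house
# into the single tab-delimited line
#   \w na<TAB>\g house
# and ONE_FLD_PER_LINE turns that line back into one field per line, with a
# blank line after the record.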
# Modify any fields as required for the reversal.
# This may involve re-naming SFMs. Two cases where this may be necessary are:
#  (1) To match the fields in some already-reversed lexicon
#  (2) To convert sub-senses into ordinary senses, thereby flattening the structure:
#       \sense building
#       \subsense house
#       \subsense outhouse
#      would become
#       \sense building
#       \sense house
#       \sense outhouse
# Any required steps are defined in the language-specific makefile;
# by default, this is just a 'cat':
MOD_FIELDS4REVERSAL = $(CAT)

#Do any required post-processing. By default this is a no-op, but it can be modified
# on a language-particular basis:
POSTPROCESS_LEX = $(CAT)

#Delete any leading sense numbers. If this is to be done, it should be done
# in the language-specific makefile; by default this step is just a 'cat':
DELETE_SENSE_NUMBERS = $(CAT)

#Make a given SFM (specified by a command line parameter) and its field be
# the first SFM in the record. If more than one of the specified SFMs appears
# in the record, the record will be split into multiple records, one for each
# instance of the SFM:
#   \w na
#   \g (1) house (2) building
#   \ex Yax bo'on ta tut na.
# will become
#   \g house
#   \w na
#   \ex Yax bo'on ta tut na.
#
#   \g building
#   \w na
#   \ex Yax bo'on ta tut na.
# If sub-senses exist, they should have been turned into ordinary senses in an earlier stage.
# Caller must define OUT_REC_SEP.
SET_FIRST_SFM = $(PYTHON) $(MYLANGDIR)/SetFirstSFM.py -r $(OUT_REC_SEP) -w Reversal.warn

#Filter out any records whose headword contains a non-ASCII alphabetic char.
RM_NONASCII_HEADWORDS = $(PYTHON) $(MYLANGDIR)/RmNonASCIIHeadWords.py

#Chop the input into chunks, based on the first few letters of the headword.
# Need to specify in the command line the -r flag for the record-initial SFM, and
# the -s flag for the field containing the head word.
CHOPDICT = $(PYTHON) $(MYLANGDIR)/ChopDict.py

#-------------------------Paradigms:

#-----------------Recipes---------------------
%.token: %.txt
#Produce a sorted list of non-numeric tokens from a text file in a simple-minded way.
# The alphabetic chars are lower-cased (using whatever encoding happens to be set).
	$(TR) -cs '[:alpha:]' '\n' < $< \
	| $(TR) '[:upper:]' '[:lower:]' \
	| $(SORT) -u \
	| $(GREP) -vE '^[[:digit:]]+$$' \
	> $@

SFMPatterns.txt: $(LOCAL_LEX)
#Create a list of unique SFM patterns within records.
	REC_SEP=$(IN_REC_SEP) ; \
	$(CAT) $< \
	| $(ONE_FLD_PER_LINE) \
	| $(CUT) -d" " -f1 \
	| $(ONE_REC_PER_LINE) \
	| $(SORT) -u \
	> $@

%.VerifySFMs: % $(RX_FILE)
#Test the conformance of the SFM file % to the regular expr describing the licit record
# structure, as given in the file %.rx. The result is output to the file %.VerifySFMs.
# This is a copy of the original file, but with error msgs inserted; it can be read into
# Shoebox and edited in place of the original SFM file.
	$(BANNER)
	$(VERIFY_SFMS) -r $(RX_FILE) -s $< \
	> $@

$(XTOKENIZE_FSM): $(MYSRCDIR)/xfst/Tokenize.xfst
#Beesley and Karttunen tokenizer (see definition for XTOKENIZE_FSM, above).
	cd $(dir $<); \
	$(XFST) -e "source $(notdir $<)" -stop
# Do NOT continue above line onto the following, so the resulting file gets copied to the correct dir:
	cp $(dir $<)/$@ .
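# For illustration (hypothetical file name, not part of the build): given a text
# file Genesis.txt, the pattern rule above produces a sorted, lower-cased token
# list with
#   make Genesis.token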
#--------------------Morphological parsing stuff--------------------
#-------------Comparisons between parsers:
# 'cfraw', 'cfstd', and 'cffull' are intended to be called by the user; '_internal_compare' is not.
# These recipes assume the following definitions:
#  NULL_FSM: Don't do any conversion of tags; defined above.
#  STD_CONVERTER{1,2}: A "standard" converter to be tested (to convert from parser-specific tags
#     to standardized tags); defined in the calling makefile, normally the language-specific makefile.
#  FULL_CONVERTER{1,2}: A "full" converter to be tested (to convert parser-specific tag notations
#     to a more standardized notation, doing more standardization than STD_CONVERTER); defined in
#     the calling makefile, normally the language-specific makefile.
#  DIR{1,2}: The subdirs (relative to the calling makefile, usually the language-specific dir)
#     where the parser-specific converters are located. Needed so we can cd to that directory and
#     use its makefile. Also, we use the dir name as a filename for the output file, so that it
#     appears as a mnemonic in tkdiff's labels for the two files.

cfraw:
#Run '_internal_compare' (below) without any conversion of parser tags to a common format.
	@$(MAKE) -sC .. -f makefile $(NULL_FSM)
	@$(MAKE) -s _internal_compare CONVERTER1=$(NULL_FSM) CONVERTER2=$(NULL_FSM)

cfstd:
#Run '_internal_compare' doing conversion of output formats to a standard format.
# Have to make Xelda2Common.fsm and LDC2Common.fsm; can't simply list these as dependencies
# here, because their dependencies are defined in the subdir's makefile.
	@$(MAKE) -sC $(DIR1) -f makefile $(STD_CONVERTER1)
	@$(MAKE) -sC $(DIR2) -f makefile $(STD_CONVERTER2)
	@$(MAKE) -s _internal_compare CONVERTER1=$(STD_CONVERTER1) CONVERTER2=$(STD_CONVERTER2)

cffull:
#Run '_internal_compare' converting tags to common tags (e.g. 'mf' to 'masc' and 'fem').
# See comment under 'cfstd' recipe re the following calls to 'make'.
	@$(MAKE) -sC $(DIR1) -f makefile $(FULL_CONVERTER1)
	@$(MAKE) -sC $(DIR2) -f makefile $(FULL_CONVERTER2)
	@$(MAKE) -s _internal_compare CONVERTER1=$(FULL_CONVERTER1) CONVERTER2=$(FULL_CONVERTER2)

_internal_compare:
#Run two transducers on the same data, and diff the results.
# Generally called from 'cfraw' etc., which set the comparison levels.
# The default is to do no conversion (equivalent to 'cfraw').
# CONVERTER1 and CONVERTER2 are defined in the recipe that calls this recipe
# (i.e. cfraw, cfstd, or cffull).
# DIR1 and DIR2 are defined in another makefile (see above).
#Sanity tests:
#	@if [ ! $(TST_LANGUAGE)'' ] ; then $(ERROR) "TST_LANGUAGE must be defined" > /dev/tty ; exit 1 ; fi
# Above line is taken care of by make env var 'MAKEFLAGS=--warn-undefined-variables'
#	@if [ ! -d ../$(TST_LANGUAGE) ] ; then $(ERROR) "'"$(TST_LANGUAGE)"'" "must be a subdir of "../$(CURDIR) > /dev/tty ; exit 1 ; fi
# What we really want is to ensure that the current dir ends in TST_LANGUAGE, but that's rather hard...
	@if [ ! $(CONVERTER1)'' ]; then $(ERROR) "CONVERTER1 must be defined" > /dev/tty ; exit 1 ; fi
#This can happen if the user called this directly, without defining the CONVERTER1 var.
#The var is defined by the recipes 'cfraw', 'cfstd', and 'cffull'.
	@if [ ! $(CONVERTER2)'' ] ; then $(ERROR) "CONVERTER2 must be defined" > /dev/tty ; exit 1 ; fi
#See comments for previous cmd.
#Run the tests:
	$(LOCALIZE); cd $(DIR1); \
	$(MAKE) --no-print-directory -s SortFormat \
	| $(EXPAND) \
	| $(LOOKUP) $(CONVERTER1) -flags xmbTT \
	| $(GREP) -v "^ *$$" \
	| $(SORT) -u \
	1> $(TMP)/$(DIR1).out
# "-s" flag on make means silent (so we don't get comments from 'make' which result in extraneous diffs).
# Likewise the "--no-print-directory" flag (otherwise the directory changes show up in the output; can't
# seem to change this by only piping stdout).
# 'expand' converts tabs to spaces, since LOOKUP ignores anything to the right of a tab char.
# The call to 'lookup' converts the parser's output to a std format.
# Flags on 'lookup':
#  'x': don't copy input to output
#  'mb': assume multi-char symbols on both upper and lower sides (namely, the tags)
#  'TT': don't insert anything between "lemma" and "tags" (otherwise we get a tab separator
#     before the first thing it thinks is a tag, even splitting "comment" into "com ment")
# 'grep' removes blank lines introduced by 'lookup'
# (the "$" for "EOLN" needs to be doubled so 'make' won't interpret it as a var).
# 'sort' puts the canonical parses in a canonical order (note that the first column is already
# sorted by the SortFormat, but the second column may have become unsorted by virtue of being
# converted to canonical form).
# "1>" means redirect only stdout, not error msgs.
	$(LOCALIZE); cd $(DIR2); \
	$(MAKE) --no-print-directory -s SortFormat \
	| $(EXPAND) \
	| $(LOOKUP) $(CONVERTER2) -flags xmbTT \
	| $(GREP) -v "^ *$$" \
	| $(SORT) -u \
	1> $(TMP)/$(DIR2).out
# See above for various command line flags, etc.
# Finally, compare results:
	-$(VISUALDIFF) -iw $(TMP)/$(DIR1).out $(TMP)/$(DIR2).out &
# The leading "-" on this cmd allows us to ignore (sort of) the exit status of 'diff',
# which will be 1 if there were differences. The "-i" flag means ignore case diffs
# (the LDC transducer wants lower-cased words, so we lower-case words going into it,
# while xfst doesn't care, so we don't bother). The "w" flag means ignore any whitespace diffs.
# Since vdiff is a separate app, we launch it with an "&", to allow continued use of the terminal.
# If vanilla 'diff' is used instead of a visual diff, pipe its output into something like
#   $(SED) "s/^< /X: /; s/^> /L: /;"
# to convert the ">" and "<" labels in the output of diff into labels (e.g. X for xfst, L for LDC).
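# For illustration (hypothetical language dir, not part of the build): from a
# language-specific directory whose makefile defines DIR1, DIR2, and the
# converters, the three comparison levels are run as
#   make cfraw     # diff the raw parser outputs
#   make cfstd     # diff after standardizing the tag names
#   make cffull    # diff after full tag conversion (e.g. 'mf' -> 'masc'/'fem')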
#-------------Parse free text
XMLFormat: Definitions
#Dependencies:
#  (1) Definitions: checks that the vars which must be defined are indeed defined
#  (2) $(PARSER): builds the parser, if necessary.
#      THIS HAS BEEN COMMENTED OUT because it causes a 'multiple target patterns'
#      error in 'make' (probably because $(PARSER) may include e.g. 'lookup' and
#      cmd-line parameters).
#Steps:
#  (1) Preprocessing, e.g. for the LDC Spanish xducer, we convert upper case to lower.
#      If nothing is needed, 'cat' is essentially a no-op.
#  (2) Get rid of the first three fields (offsets into audio, and speaker identification)
#      and bracket each word with 'XMLTAG' on a separate line. (We will later turn 'XMLTAG'
#      into real XML tags; if we tried to insert the XML tags at this point, 'tr' would
#      strip the '<' and '>'.)
#  (3) Get rid of punctuation. Ideally, we would do this with a command like
#        $(TR) -d "[:punct:]"
#      However, I can't get 'localize' to work correctly, so for the time being, I'm defining
#      the punctuation characters individually for each language using a var PUNCT.
#  (4) Parse. The transducer sends the line numbers to stderr, which we redirect to the bit-bucket.
#      (They cannot be relied on to stay in sync on the screen, and redirecting both to a file
#      seems to be problematic.)
#  (5) Convert 'XMLTAG' into real XML tags. (At present, we just put <word> tags around the words,
#      but there's room for further development, if we design a standard XML format for interlinear
#      text...)
# The output is in a very simplified XML format.
	$(PARSE_STEPS) \
	| $(AWK) '{if (/XMLTAG/) {printf("<\\word>\n<word>\n")} else print}' \
	| $(MORE)

ArabFormat: SanityTests
#Output in "Arabic" format, so it can be used by Hubert's manual disambiguator tool.
# The output format is as follows (see the file ArabFormat.txt for a larger sample):
#
#   INPUT STRING: لبنان
#   LOOK-UP WORD: lbnAn
#   Comment:
#   * SOLUTION 1: (lubonAn) lubonAn/NOUN_PROP
#     (GLOSS): + Lebanon +
#     SOLUTION 2: (libanAn) li/PREP+banAn/NOUN
#     (GLOSS): for/to + finger tips +
#     SOLUTION 3: (labanAn) la/EMPHATIC_PARTICLE+banAn/NOUN
#     (GLOSS): indeed/truly + finger tips +
#
# If the parser doesn't succeed, there are no "solutions". The '*' is added by
# human annotators for the "correct" parse, and is not created by the following.
	$(PARSE_STEPS) \
	| $(AWK) -f $(ARABFORMAT) -v InputField=$(INPUT_FIELD) -v NoSolutionString=$(NO_SOLN_STRING)

Definitions:
#Check that the vars which must be defined are indeed defined.
# The outer grep determines whether any vars are undefined (and causes an eventual exit
# from 'make'), while the inner loop (executed only if the outer grep detects a problem)
# tells exactly which definitions are missing.
	@if $(ECHO) $(DEFS) | $(GREP) undefined >> /dev/null ; then \
		for D in $(DEFS) ; do \
			if $(ECHO) $$D | $(GREP) undefined >> /dev/null ; then \
				echo $$D ; \
			fi ; \
		done ; \
		exit 1 ; \
	fi

SortFormat: SanityTests
#Output in sorted format, to make for easy comparison between parsers.
# The output has all lines sorted together (and no duplicate lines -- the primary reason
# for sorting at this point is to eliminate dupes, to speed things up later;
# after we put the parses into canonical form, we'll do another sort).
# The output format is as follows:
#   <InputWord>\t<A Parse>
# Note that if a word parses ambiguously, the individual parses are on separate lines.
	$(GREEN); $(ECHO) PARSER=$(PARSER) > /dev/tty ; $(BLACK)
	$(PARSE_STEPS) \
	| $(GREP) -v "XMLTAG" \
	| $(AWK) -f $(SORTFORMAT) -v InputField=$(INPUT_FIELD) -v NoSolutionString=$(NO_SOLN_STRING) \
	| $(SORT) -u

SanityTests: Definitions
#Make sure the necessary variables have been defined to something that makes sense.
# (The prerequisite of 'Definitions' ensures that all the necessary vars have values;
# here we just ensure that some of those values make sense.)
# The redirection of the error msgs to /dev/tty is necessary in case output has been otherwise redirected.
#	@if [ ! -f $(TST_DATA) ] ; then $(ERROR) "Cannot find test data file" $(TST_DATA) > /dev/tty ; exit 1 ; fi
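# For illustration (hypothetical French word and tags, not part of the build):
# SortFormat output for an ambiguously parsing word looks like
#   portes	porte+Noun+Fem+Pl
#   portes	porter+Verb+PresInd+2P+Sg
# one <InputWord>\t<Parse> pair per line, sorted and de-duplicated.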
#-------------------Dictionary recipes----------------------------------
#Based on SFM dictionaries. Could instead be based on XML format.

SFMs: $(LOCAL_LEX)
#Collect all the unique SFMs. Although we tell 'cut' to cut at a space char, it
# seems to leave on some extraneous space chars. So we use 'tr' to remove them.
	$(BANNER)
	$(CAT) $(LOCAL_LEX) \
	| $(GREP) "^\\\." \
	| $(TR) -s " " " " \
	| $(CUT) -d" " -f1 \
	| $(SORT) -u \
	> SFMs

SFMCounts: SFMs
#Construct a list of unique SFMs, each followed by a (tab-separated) count.
# The list is sorted from most common to least common.
# The rx prefix does not need the backslash char, because it's already in the SFMs.
	$(BANNER)
	$(ECHO) > $@ #Wipe any previous results
	RXPREFIX="^" ; \
	RXSUFFIX=" " ; \
	$(COUNTS)

POSs: $(LOCAL_LEX)
#Construct a list of unique POSs. Ordinarily, this gives (and the following recipe counts)
# whatever field POS_SFM is set to in the language-particular makefile. However, this can
# be overridden on the command line as follows:
#   export POS_SFM=infv ; make -e POSs
	$(BANNER)
#Ensure POS_SFM is set:
	if test "$(POS_SFM)" = '' ; then \
		$(ERROR) "Variable POS_SFM must be set" ; \
		exit 1 ; \
	fi
	FIELDS=$(POS_SFM) ; \
	$(EXTRACT_FIELDS) \
	| $(SORT) -u \
	| $(GREP) -vE "^[[:space:]]*$$" \
	> POSs

POSCounts: POSs
#Construct a list of unique POSs, each followed by a (tab-separated) count.
# The list is sorted from most common to least common.
	$(BANNER)
#Ensure POS_SFM is set:
	if test "$(POS_SFM)" = '' ; then \
		$(ERROR) "Variable POS_SFM must be set" ; \
		exit 1 ; \
	fi
	$(ECHO) > $@ #Wipe any previous results
	RXPREFIX="\\$(POS_SFM) " ; \
	RXSUFFIX="$$" ; \
	$(COUNTS)

Reversal.db: $(LOCAL_LEX)
#Reverse the lexicon.
# Input format:
#   Record separator = \id
#   Yoruba word      = \w
#   English word     = \d
# Output format:
#   Record separator = \ENGL
#   Yoruba word      = \YORU
#   English word     = \ENGL
# Steps:
#  (1) Modify any fields as required for the reversal.
#      This may involve re-naming SFMs (perhaps to match the fields in some already-
#      reversed lexicon). The steps for this are defined in the language-specific
#      makefile, as the var MOD_FIELDS4REVERSAL. By default, this is just a 'cat'.
#  (2) Split multiple senses. The steps for this are defined in the language-specific
#      makefile, as the var DELETE_SENSE_NUMBERS (q.v.). By default, this is just a 'cat'.
#  (3) Concatenate each record onto a single line, with a tab char before every SFM
#      except the first (using ONE_REC_PER_LINE).
#  (4) Reversal: Put the sense (generally an English gloss) in first position on each line,
#      turning single records into multiple ones where there was more than one gloss field
#      (e.g. where step (2) split multiple senses into multiple gloss fields).
#  (5) Sort it (by the English now), folding case (so upper and lower case words sort
#      adjacent). We do NOT sort uniquely; this would have the side effect of getting
#      rid of duplicate records, if there are any, but it may be better not to do that
#      (as a QC measure--although at present I don't have any such QC).
#  (6) Perform any post-processing. By default, POSTPROCESS_LEX is a no-op, but it can
#      be re-defined on a language-particular basis.
#  (7) Un-concatenate fields, putting one newline between fields of a single
#      record, and two newlines between records.
	$(BANNER)
	REC_SEP=$(IN_REC_SEP) ; \
	$(CAT) $< \
	| $(MOD_FIELDS4REVERSAL) \
	| $(DELETE_SENSE_NUMBERS) \
	| $(ONE_REC_PER_LINE) \
	| $(SET_FIRST_SFM) \
	| $(SORT) -f \
	| $(POSTPROCESS_LEX) \
	| $(ONE_FLD_PER_LINE) \
	> $@
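# For illustration (hypothetical record, not part of the build): after renaming
# fields and reversing, a record such as
#   \w ile
#   \d house
# is emitted with the English gloss as the new first field:
#   \ENGL house
#   \YORU ile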
FrenchWords.txt: $(LOCAL_LEX)
#Extract the words from all the French fields, tokenize them, and sort them uniquely
# (for spell checking). We currently take glosses from only a single field, but
# it might be desirable to loosen this restriction so we could include e.g.
# translations of example sentences.
#	FIELDS=$(FRENCH_GLOSS_FIELD) ;
	FIELDS=$(FRENCH_SFMs) ; \
	$(EXTRACT_FIELDS) \
	| $(TR) -s "[:blank:][:punct:]" "\n" \
	| $(SORT) -u \
	> FrenchWords.txt

FrenchMisspellings.txt: FrenchWords.txt
#Run a French spell checker over the French words.
# WARNING: 'aspell' may not be installed on every system.
	$(BANNER)
	$(SPELL) --lang=fr -l < $^ \
	> $@

EnglishWords.txt: $(LOCAL_LEX)
#Extract all the words from the English fields, tokenize them, and sort them uniquely
# (for spell checking). See additional comments above re French words.
	FIELDS=$(ENGLISH_GLOSS_FIELD) ; \
	$(EXTRACT_FIELDS) \
	| $(TR) -s "[:blank:][:punct:]" "\n" \
	| $(SORT) -u \
	> EnglishWords.txt

EnglishMisspellings.txt: EnglishWords.txt
#Run an English spell checker over the English words.
# WARNING: 'aspell' may not be installed on every system.
	$(BANNER)
	$(SPELL) --lang=en -l < $^ \
	> $@

MissingXrefs.txt: $(LOCAL_LEX)
#Find any cross-references that don't exist.
# Requires two definitions:
#  XREF_ER_SFMS: List of SFMs used for xrefs (without backslashes)
#     NB: If there is more than one, they must be quoted
#  XREF_EE_SFM: SFM for the field to which xrefs refer (without the backslash)
# We assume all xrefs point to the same SFM (probably the citation form).
	$(BANNER)
#First get a list of all referees (things to which an xref can refer):
	FIELDS=$(XREF_EE_SFM) ; \
	$(EXTRACT_FIELDS) \
	| $(SORT) -u \
	> $(TMP)/xreferees.txt
#Then get a list of all referers:
	FIELDS=$(XREF_ER_SFMS) ; \
	$(EXTRACT_FIELDS) \
	| $(SORT) -u \
	> $(TMP)/xreferers.txt
	$(DIFF) $(TMP)/xreferees.txt $(TMP)/xreferers.txt \
	| $(GREP) "> " \
	| $(SED) -e"s/^> //" \
	> $@

ReciprocalXrefs.txt: $(LOCAL_LEX)
#For each cross-ref which exists and which is supposed to be reciprocal,
# ensure that it exists in both directions. Requires two definitions:
#  XREF_RECIP_SFM: SFM used for reciprocal xrefs (without the backslash)
#  XREF_EE_SFM: SFM for the field to which xrefs refer (without the backslash)
# We assume all xrefs point to the same SFM (probably the citation form).
# Algorithm: extract all the xreferer-->xreferee pairs into two sorted tab-delimited lists,
# one as Referer\tReferee, the other as Referee\tReferer. Do a diff between the two lists;
# any items in the second list but not in the first need to be added to the dictionary.
# Create a file with the order REFERER - REFEREE:
	REC_SEP=$(IN_REC_SEP) ; \
	$(CAT) $(LOCAL_LEX) \
	| $(ONE_REC_PER_LINE) \
	| $(PROJECT) -r $(XREF_RECIP_SFM) -o $(XREF_EE_SFM) \
	| $(SORT) \
	> $(TMP)/XRefs.sfm
# Create a file with the reverse order (REFEREE - REFERER), but with the SFMs swapped.
# First, the original XREF_RECIP_SFM fields:
	$(CUT) -f1 < $(TMP)/XRefs.sfm \
	| $(SED) -e"s/^\\\\$(XREF_RECIP_SFM)/\\\\$(XREF_EE_SFM)/" \
	> $(TMP)/XRefs1.sfm
	$(CUT) -f2 < $(TMP)/XRefs.sfm \
	| $(SED) -e"s/^\\\\$(XREF_EE_SFM)/\\\\$(XREF_RECIP_SFM)/" \
	> $(TMP)/XRefs2.sfm
	$(PASTE) $(TMP)/XRefs2.sfm $(TMP)/XRefs1.sfm \
	| $(SORT) \
	> $(TMP)/RevXRefs.sfm
# Ff. cmd must be preceded by a '-', because 'diff' sets the exit status to non-zero
# if there are any diffs:
	-$(DIFF) $(TMP)/XRefs.sfm $(TMP)/RevXRefs.sfm \
	| $(GREP) "^> " \
	| $(SED) -e"s/^> //" \
	> $(TMP)/MissingXRefs.sfm
# ...But we want to reverse the order of fields in the latter file, so:
	$(CUT) -f1 < $(TMP)/MissingXRefs.sfm > $(TMP)/MissingXRefs1.sfm
	$(CUT) -f2 < $(TMP)/MissingXRefs.sfm > $(TMP)/MissingXRefs2.sfm
	$(PASTE) $(TMP)/MissingXRefs2.sfm $(TMP)/MissingXRefs1.sfm > $@

ValidateXMLLex: $(XML_DICT) $(XSD_FILE)
#Run the XML version of the dictionaries through XML validation:
	$(XMLLINT) --noout --schema $(XSD_FILE) $(XML_DICT)

install:
#Copy some results over to a user's directory.
#	-rm $(INSTALL_DIR)/*    DANGEROUS!
	cp $(OUTPUT_FILES) $(INSTALL_DIR)

endif #ifndef LANG_MAKEFILE