;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;                                                                       ;;
;;;                Centre for Speech Technology Research                  ;;
;;;                     University of Edinburgh, UK                       ;;
;;;                       Copyright (c) 1996,1997                         ;;
;;;                        All Rights Reserved.                           ;;
;;;                                                                       ;;
;;;  Permission to use, copy, modify, distribute this software and its    ;;
;;;  documentation for research, educational and individual use only, is  ;;
;;;  hereby granted without fee, subject to the following conditions:     ;;
;;;   1. The code must retain the above copyright notice, this list of    ;;
;;;      conditions and the following disclaimer.                         ;;
;;;   2. Any modifications must be clearly marked as such.                ;;
;;;   3. Original authors' names are not deleted.                         ;;
;;;  This software may not be used for commercial purposes without        ;;
;;;  specific prior written permission from the authors.                  ;;
;;;                                                                       ;;
;;;  THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK        ;;
;;;  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      ;;
;;;  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   ;;
;;;  SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE     ;;
;;;  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    ;;
;;;  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   ;;
;;;  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          ;;
;;;  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       ;;
;;;  THIS SOFTWARE.                                                       ;;
;;;                                                                       ;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;;  Various tokenizing functions and customization 

(define (Token utt)
  "(Token UTT)
Convert the Token stream of UTT into a Word stream, expanding compound
words, numbers etc. from tokens into words.  If a Token_Method returns
a non-nil result that result is used, otherwise the Parameter Language
selects the appropriate token to word module."
  (let ((method_result (apply_method 'Token_Method utt)) ;; newer style hook
        (lang (Parameter.get 'Language)))
    (if method_result
        method_result
        ;; older style: dispatch on the Language parameter
        (cond
         ((or (eq? 'britishenglish lang)
              (eq? 'english lang)
              (eq? 'americanenglish lang))
          (Token_English utt))
         ((eq? 'welsh lang)
          (Token_Welsh utt))
         ;; languages served by the language-independent module
         ((or (eq? 'spanish lang)
              (eq? 'french lang)
              (eq? 'any lang))
          (Token_Any utt))
         (t
          (print "TOKEN: unknown language, using language-independent tokening")
          (Token_Any utt))))))

(define (english_token_to_words utt token name)
"(english_token_to_words UTT TOKEN NAME)
Returns a list of words for NAME from TOKEN in UTT.  This allows the
user to customize various non-local, multi-word, context dependent
translations of tokens into words.  If this function is unset only
the builtin translation rules are used, if this is set the builtin
rules are not used unless explicitly called. [see Token to word rules]"
 ;; The body is one big cond: each clause recognises one token shape
 ;; (money, fractions, times, dates, phone numbers, Roman numerals,
 ;; e-mail addresses, ...) and returns the list of words for it.  A word
 ;; is either a string or a feature list such as ((name "'s")(pos nnp)).
 ;; Anything unrecognised falls through to builtin_english_token_to_words.
 ;; Patterns are written as whole-string matches (hence the explicit
 ;; leading/trailing ".*" wherever a substring is being looked for).
 (let ((money_regex
        ;; Shared by the money clause and the "illion" clause that follows
        ;; it so the two cannot drift apart.  The pound sign is kept out of
        ;; the bracket expression so that it still works when it is stored
        ;; as more than one byte; it must be saved in the same character
        ;; encoding as the text being synthesized.
        "[A-Z]*\\([\\$#\\\\Y]\\|£\\)[0-9,]+\\(\\.[0-9]+\\)?"))
  (cond
   ((string-matches name money_regex)
    ;; Some form of money (pounds, yen or type of dollars)
    (let (amount type currency)
      ;; Split NAME at the currency symbol: TYPE is the prefix (e.g. the
      ;; "US" of "US$"), AMOUNT is the number after the symbol.
      (cond
       ((string-matches name ".*\\$.*")
        (set! amount (string-after name "$"))
        (set! type (string-before name "$"))
        (set! currency "dollar"))
       ((string-matches name ".*£.*")
        (set! amount (string-after name "£"))
        (set! type (string-before name "£"))
        (set! currency "pound"))
       ((string-matches name ".*#.*")
        (set! amount (string-after name "#"))
        (set! type (string-before name "#"))
        (set! currency "pound"))
       ((string-matches name ".*Y[0-9].*")
        (set! amount (string-after name "Y"))
        (set! type (string-before name "Y"))
        (set! currency "yen"))
       ((string-matches name ".*\\\\.*")
        (set! amount (string-after name "\\"))
        (set! type (string-before name "\\"))
        (set! currency "yen"))
       (t
        ;; who knows
        (set! amount (string-after name "$"))
        (set! type (string-before name "$"))
        (set! currency "dollar")))
      (cond
       ((string-matches (utt.streamitem.feat utt token "n.name")
                        ".*illion.?")
        (append   ;; billions and billions
         (builtin_english_token_to_words utt token amount)
         (list (utt.streamitem.feat utt token "n.name")) ;; illion
         (token_money_expand type)
         (list (string-append currency "s"))))
       ((string-matches amount ".*\\...$")
        (append   ;; exactly two places after point
         (builtin_english_token_to_words
          utt token (string-before amount "."))
         (token_money_expand type)
         (if (or (string-matches amount "1\\..*")
                 (string-equal currency "yen"))
             (list currency)
             (list (string-append currency "s")))
         (if (not (string-matches name ".*\\.00$"))
             (builtin_english_token_to_words
              utt token (remove_leading_zeros (string-after amount ".")))
             nil)))
       (t
        (append   ;; nothing after point or lots after point
         (builtin_english_token_to_words utt token amount)
         (token_money_expand type)
         (if (or (string-matches amount "1")
                 (string-equal currency "yen"))
             (list currency)
             (list (string-append currency "s"))))))))
   ((and (string-matches (utt.streamitem.feat utt token "p.name")
                         money_regex)
         (string-matches name ".*illion.?"))
    nil ;; dealt with on the previous symbol
    )
   ((string-matches name "[1-9][0-9]*/[1-9][0-9]*")
    ;; fractions
    (let ((numerator (string-before name "/"))
          (denominator (string-after name "/"))
          )
      (cond
       ((string-matches name "1/2")
        (list "half"))
       ((string-matches denominator "4")
        (append
         (builtin_english_token_to_words utt token numerator)
         (list "quarter")
         ;; plural marker only when there is more than one quarter,
         ;; the same rule as the general fraction case below
         (if (string-equal numerator "1")
             nil
             (list '((name "'s")(pos nnp))))))
       (t
        (append
         (builtin_english_token_to_words utt token numerator)
         (begin
           ;; denominator is read as an ordinal ("third", "fifth" ...)
           (streamitem.set_feat token "token_pos" "ordinal")
           (builtin_english_token_to_words utt token denominator))
         (if (string-equal numerator "1")
             nil
             (list '((name "'s")(pos nnp)))))))))
   ((and (string-matches name "No")
         (string-matches (utt.streamitem.feat utt token "n.name")
                         "[0-9]+"))
    (list
     "number"))
   ((string-matches name ".*%$")
    (append
     ;; goes through the user-settable token_to_words, not the builtin
     (token_to_words utt token (string-before name "%"))
     (list "percent")))
   ((string-matches name "[0-9]+s")  ;; e.g. 1950s
    (streamitem.set_feat token "token_pos" "year")  ;; reasonable guess
    (append
     (builtin_english_token_to_words utt token (string-before name "s"))
     (list '((name "'s")(pos nnp))) ;; will get assimilated by postlexical rules
     ))
   ((string-matches name "[0-9]+'s")  ;; e.g. 1950's
    (streamitem.set_feat token "token_pos" "year")  ;; reasonable guess
    (append
     (builtin_english_token_to_words utt token (string-before name "'s"))
     (list '((name "'s")(pos nnp))) ;; will get assimilated by postlexical rules
     ))
   ((and (string-matches name ".*s$")
         (string-equal (utt.streamitem.feat utt token "punc") "'"))
    ;; potential possessive or may be end of a quote
    (if (token_no_starting_quote utt token)
        (streamitem.set_feat token "punc" ""))
    (builtin_english_token_to_words utt token name))
   ((string-matches name "[0-9]?[0-9][:\\.][0-9][0-9][AaPp][Mm]")  ;; time
    ;;  must be am/pm present for . to be acceptable separator
    (let (hours mins half sep (ttime (downcase name)))
      (if (string-matches ttime ".*:.*")
          (set! sep ":")
          (set! sep "."))
      (set! hours (string-before ttime sep))
      (set! mins (string-after ttime sep))
      ;; SEP is reused to hold the am/pm suffix.  The pattern needs the
      ;; leading ".*" because the whole of TTIME has to match.
      (if (string-matches ttime ".*am")
          (set! sep "am")
          (set! sep "pm"))
      (set! mins (string-before mins sep))
      (append
       (builtin_english_token_to_words utt token hours)
       (builtin_english_token_to_words utt token mins)
       (list sep))))
   ((string-matches name "[0-9]?[0-9]:[0-9][0-9]")  ;; time
    (append
     (builtin_english_token_to_words
      utt token (remove_leading_zeros (string-before name ":")))
     (builtin_english_token_to_words
      utt token (remove_leading_zeros (string-after name ":")))))
   ((string-matches name "[0-9][0-9]:[0-9][0-9]:[0-9][0-9]")  ;; exact time
    (append
     (builtin_english_token_to_words
      utt token (remove_leading_zeros (string-before name ":")))
     (list "hours")
     (builtin_english_token_to_words
      utt token (remove_leading_zeros
                 (string-before (string-after name ":") ":")))
     (list "minutes" "and")
     (builtin_english_token_to_words
      utt token (remove_leading_zeros
                 (string-after (string-after name ":") ":")))
     (list "seconds")))
   ((string-matches name "[0-9][0-9]?/[0-9][0-9]?/[0-9][0-9]\\([0-9][0-9]\\)?")
    ;; date, say it as numbers to avoid American/British problem
    (let ((num1 (string-before name "/"))
          (num2 (string-before (string-after name "/") "/"))
          (year (string-after (string-after name "/") "/"))
          day month)
      (streamitem.set_feat token "token_pos" "cardinal")
      (set! day (builtin_english_token_to_words utt token num1))
      (set! month (builtin_english_token_to_words utt token num2))
      ;; token_pos is switched before the last field is expanded, so
      ;; the two leading fields are cardinals and the last one a year
      (streamitem.set_feat token "token_pos" "year")
      (append
       day
       month
       (list '((name ",")(pbreak_scale 0.9)))
       (builtin_english_token_to_words utt token year))))
   ((string-matches name "[0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]")
    (streamitem.set_feat token "token_pos" "digits")  ;; canonical phone number
    (append
     (builtin_english_token_to_words utt token (string-before name "-"))
     (list '((name ",")(pbreak_scale 1.0)))
     (builtin_english_token_to_words utt token (string-after name "-"))))
   ((string-matches name "[0-9]+-[0-9]+-[-0-9]+")
    ;; long distance number
    ;; R starts with a dummy head that is dropped by the final cdr;
    ;; REMAINDER is consumed one hyphen-separated group at a time.
    (let ((r '(dummy)) (remainder name))
      (streamitem.set_feat token "token_pos" "digits")
      (while (> (length remainder) 0)
        (if (string-matches remainder "[0-9]+")
            (set! r (append r
                            (builtin_english_token_to_words
                             utt token remainder)))
            (set! r (append r
                            (builtin_english_token_to_words
                             utt token (string-before remainder "-")))))
        (set! remainder (string-after remainder "-"))
        (if (> (length remainder) 0)
            (set! r (append r (list '((name ",")(pbreak_scale 1.0)))))))
      (cdr r))
    )
   ((and (string-matches name "[0-9][0-9][0-9]")
         (string-matches (utt.streamitem.feat utt token "n.name")
                         "[0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]"))
    ;; three digits directly before a phone number: read as digits too
    (streamitem.set_feat token "token_pos" "digits")
    (builtin_english_token_to_words utt token name))
   ((string-matches name "[0-9]+-[0-9]+")
    ;; number range: the token is renamed to its first number so the
    ;; "[0-9]+" CART tree can choose a token_pos for it
    (let ((tokpos))
      (streamitem.set_name token (string-before name "-"))
      (set! tokpos (wagon utt token
                          (car (cdr (assoc "[0-9]+" token_pos_cart_trees)))))
      (streamitem.set_feat token "token_pos" (car tokpos))
      (append
       (builtin_english_token_to_words utt token (string-before name "-"))
       (list "to")
       (builtin_english_token_to_words utt token (string-after name "-")))))
   ((string-matches name "\\(II?I?\\|IV\\|VI?I?I?\\|IX\\|X[VIX]*\\)")
    ;; Roman numerals
    (let ((tp (utt.streamitem.feat utt token "token_pos")))
      (cond
       ((string-matches tp "century");; always believe this
        (streamitem.set_feat token "token_pos" "ordinal")
        (if (or (string-equal "1" (tok_rex utt token))
                (utt.streamitem.feat utt token "p.lisp_tok_rex_names"))
            (append
             (list "the")
             (builtin_english_token_to_words
              utt token (tok_roman_to_numstring name)))
            (builtin_english_token_to_words
             utt token (tok_roman_to_numstring name))))
       ((string-matches name "[IVX]");; be *very* wary of this one
        (if (and (string-equal
                  "1" (utt.streamitem.feat utt token "p.lisp_tok_section_name"))
                 (string-matches tp "number"))
            (builtin_english_token_to_words
             utt token (tok_roman_to_numstring name))
            (tok_string_as_letters name)))
       ((string-matches tp "number")
        (streamitem.set_feat token "token_pos" "cardinal")
        (builtin_english_token_to_words
         utt token (tok_roman_to_numstring name)))
       (t;; else its a letter
        (tok_string_as_letters name)))))
   ((and (string-matches name "pp")
         (string-matches (utt.streamitem.feat utt token "n.name")
                         "[0-9]+-[0-9]+"))
    (list "pages"))
   ((and (string-matches name "ss")
         (string-matches (utt.streamitem.feat utt token "n.name")
                         "[0-9]+-[0-9]+"))
    (list "sections"))
   ((string-matches name "_____+")
    (list "line" "of" "underscores"))
   ((string-matches name "=====+")
    (list "line" "of" "equals"))
   ((string-matches name "-----+")
    (list "line" "of" "hyphens"))
   ((string-matches name "\\*\\*\\*\\*\\*+")
    (list "line" "of" "asterisks"))
   ((string-matches name "--+")
    (list '((name ",")(pbreak_scale 1.0))))
   ((string-matches name ".*--+.*")
    (append
     (builtin_english_token_to_words utt token (string-before name "--"))
     (list '((name ",")(pbreak_scale 1.0)))
     (builtin_english_token_to_words utt token (string-after name "--"))))
   ((string-matches name "[A-Z][A-Z]?&[A-Z][A-Z]?")
    (append
     (tok_string_as_letters (string-before name "&"))
     (list "and")
     (tok_string_as_letters (string-after name "&"))))
   ((or (string-matches name "[A-Z][A-Z]+s")
        (string-matches name "[BCDEFGHJKLMNOPQRSTVWXYZ]+s"))
    ;; plural of something in capitals
    (append
     (builtin_english_token_to_words utt token (string-before name "s"))
     (list '((name "'s")(pos nnp))) ;; will get assimilated by postlexical rules
     ))
   ((string-matches name "<.*@.*>")  ;; quoted e-mail
    (append
     (builtin_english_token_to_words
      utt token (string-after (string-before name "@") "<"))
     (list "at")
     (builtin_english_token_to_words
      utt token (string-before (string-after name "@") ">"))))
   ((string-matches name ".*@.*")  ;; e-mail
    (append
     (builtin_english_token_to_words
      utt token (string-before name "@"))
     (list "at")
     (builtin_english_token_to_words
      utt token (string-after name "@"))))
   ((string-matches name "\\([dD][Rr]\\|[Ss][tT]\\)")
    ;; Dr/St: street name or title, depending on token_pos
    (if (string-equal (utt.streamitem.feat utt token "token_pos") "street")
        (if (string-matches name "[dD][rR]")
            (list "drive")
            (list "street"))
        (if (string-matches name "[dD][rR]")  ;; default on title side
            (list "doctor")
            (list "saint"))))
   ((string-matches name "[Cc]alif")  ;; hopelessly specific ...
    (list
     "california"))
   (t
    (builtin_english_token_to_words utt token name)))))

;;; This is set as the default
;;; (english_token_to_words calls back through this variable when it
;;; expands the number part of a "N%" token)
(defvar token_to_words english_token_to_words)

(defvar token.punctuation "\"'`.,:;!?(){}[]"
  "token.punctuation
A string of characters which are to be treated as punctuation when
tokenizing text.  Punctuation symbols will be removed from the text
of the token and made available through the \"punc\" feature.
[see Tokenizing]")
(defvar token.prepunctuation "\"'`({["
  "token.prepunctuation
A string of characters which are to be treated as preceding punctuation
when tokenizing text.  Prepunctuation symbols will be removed from the text
of the token and made available through the \"prepunctuation\" feature.
[see Tokenizing]")
(defvar token.whitespace " \t\n\r"
  "token.whitespace
A string of characters which are to be treated as whitespace when
tokenizing text.  Whitespace is treated as a separator and removed
from the text of a token and made available through the \"whitespace\"
feature.  [see Tokenizing]")
(defvar token.singlecharsymbols ""
  "token.singlecharsymbols
Characters which always have to be split off as tokens.  This would be
unusual in standard text, but is useful in parsing some types of
file. [see Tokenizing]")

(defvar token.letter_pos 'nn
  "token.letter_pos
The part of speech tag (valid for your part of speech tagger) for
individual letters.  When the tokenizer decides to pronounce a token
as a list of letters this tag is added to each letter in the list.  
Note this should be from the part of speech set used in your tagger 
which may not be the same one that appears in the actual lexical 
entry (if you map them afterwards).  This specifically allows \"a\"
to come out as ae rather than @.")

(defvar token.unknown_word_name "unknown"
  "token.unknown_word_name
When all else fails and a pronunciation for a word or character can't
be found this word will be said instead.  If you make this \"\" then
the unknown word will simply be omitted.  This will only
really be called when there is a bug in the lexicon and characters
are missing from the lexicon.  Note this word should be in the lexicon.")

;;; Documentation strings for the features the tokenizer puts on tokens
(def_feature_docstring
  'Token.punc
  "Token.punc
Succeeding punctuation symbol found after token in original 
string/file.")
(def_feature_docstring
  'Token.whitespace
  "Token.whitespace
Whitespace found before token in original string/file.")
(def_feature_docstring
  'Token.prepunctuation
  "Token.prepunctuation
Preceding punctuation symbol found before token in original string/file.")

(require 'tokenpos)
;;;
;;;  Token pos are gross level part of speech tags which help decide
;;;  the pronunciation of tokens (in particular the expansion of tokens
;;;  into words).  The most obvious example is identifying number types
;;;  (ordinals, years, digits or numbers).
;;;
;;; English token_pos decision trees: a list of (Regex Tree) entries, one
;;; for plain digit strings, one for Roman numerals and one for Dr/St.
;;; Every tree node below has the shape ((FEATURE OP VALUE) SUBTREE SUBTREE)
;;; and every leaf the shape ((LABEL)); presumably the first subtree is
;;; taken when the question holds and the second otherwise -- confirm
;;; against the CART documentation before editing a tree by hand.
(defvar english_token_pos_cart_trees
  '(
    ;;  Format is (Regex Tree)
    ;; Digit strings: leaves are ordinal, cardinal, year or digits.
    ;; english_token_to_words also looks this entry up by its regex
    ;; string (via assoc) when it expands number ranges.
    ("[0-9]+" 
((pp.lisp_token_pos_guess is day)
 ((ordinal))
 ((n.lisp_token_pos_guess in (miles months hours days million ft years states points m km metres kilometres Hz KHz feet ft foot ton tons tonne tonnes bytes MB mb MBS mbs))
  ((cardinal))
  ((p.lisp_token_pos_guess in (page than some at))
   ((cardinal))
   ((pp.lisp_token_pos_guess in (he many))
    ((cardinal))
    ((p.lisp_token_pos_guess is late)
     ((year))
     ((n.lisp_token_pos_guess is pages)
      ((cardinal))
      ((p.lisp_token_pos_guess is age)
       ((cardinal))
       ((p.lisp_token_pos_guess is just)
	((cardinal))
	((p.lisp_token_pos_guess is now)
	 ((cardinal))
	 ((p.lisp_token_pos_guess is on)
	  ((ordinal))
	  ((p.lisp_token_pos_guess is since)
	   ((year))
	   ((pp.lisp_token_pos_guess is has)
	    ((cardinal))
	    ((pp.lisp_token_pos_guess is says)
	     ((cardinal))
	     ((p.lisp_token_pos_guess is early)
	      ((year))
	      ((p.lisp_token_pos_guess is day)
	       ((ordinal))
	       ((n.lisp_token_pos_guess is people)
		((cardinal))
		((p.lisp_token_pos_guess is about)
		 ((cardinal))
		 ((p.lisp_token_pos_guess is no)
		  ((cardinal))
		  ((n.lisp_token_pos_guess is who)
		   ((cardinal))
		   ((pp.lisp_token_pos_guess is i)
		    ((cardinal))
		    ((p.lisp_token_pos_guess is all)
		     ((cardinal))
		     ((p.lisp_token_pos_guess is with)
		      ((cardinal))
		      ((n.lisp_token_pos_guess is former)
		       ((cardinal))
		       ((pp.lisp_token_pos_guess is billion)
			((year))
			((p.lisp_token_pos_guess is least)
			 ((cardinal))
			 ((pp.lisp_token_pos_guess is age)
			  ((cardinal))
			  ((n.lisp_token_pos_guess is campaign)
			   ((year))
			   ((nn.lisp_token_pos_guess is f)
			    ((cardinal))
			    ((lisp_num_digits < 3.8)
			     ((pp.lisp_token_pos_guess is out)
			      ((cardinal))
			      ((nn.lisp_token_pos_guess is years)
			       ((cardinal))
			       ((p.lisp_token_pos_guess is that)
				((cardinal))
				((n.lisp_token_pos_guess is other)
				 ((cardinal))
				 ((p.lisp_token_pos_guess is in)
				  ((cardinal))
				  ((pp.lisp_token_pos_guess is up)
				   ((cardinal))
				   ((p.lisp_token_pos_guess is after)
				    ((cardinal))
				    ((p.lisp_token_pos_guess is around)
				     ((cardinal))
				     ((p.lisp_token_pos_guess is to)
				      ((cardinal))
				      ((p.lisp_token_pos_guess is from)
				       ((cardinal))
				       ((n.lisp_token_pos_guess is members)
					((cardinal))
					((pp.lisp_token_pos_guess is that)
					 ((cardinal))
					 ((p.lisp_token_pos_guess is its)
					  ((cardinal))
					  ((n.lisp_token_pos_guess is of)
					   ((cardinal))
					   ((p.lisp_token_pos_guess is by)
					    ((cardinal))
					    ((p.lisp_token_pos_guess is month)
					     ((name < 85.9)
					      ((ordinal))
					      ((year)))
					     ((nn.lisp_token_pos_guess is of)
					      ((cardinal))
					      ((nn.lisp_token_pos_guess is as)
					       ((cardinal))
					       ((n.lisp_token_pos_guess is a)
						((cardinal))
						((nn.lisp_token_pos_guess is his)
						 ((cardinal))
						 ((pp.lisp_token_pos_guess is an)
						  ((cardinal))
						  ((n.lisp_token_pos_guess is to)
						   ((cardinal))
						   ((pp.lisp_token_pos_guess is and)
						    ((cardinal))
						    ((nn.lisp_token_pos_guess is for)
						     ((cardinal))
						     ((nn.lisp_token_pos_guess is were)
						      ((cardinal))
						      ((nn.lisp_token_pos_guess is to)
						       ((cardinal))
						       ((nn.lisp_token_pos_guess is and)
							((cardinal))
							((nn.lisp_token_pos_guess is in)
							 ((cardinal))
							 ((n.lisp_token_pos_guess is the)
							  ((cardinal))
							  ((pp.lisp_token_pos_guess is to)
							   ((cardinal))
							   ((n.lisp_token_pos_guess is is)
							    ((cardinal))
							    ((nn.lisp_token_pos_guess is that)
							     ((cardinal))
							     ((pp.lisp_token_pos_guess is the)
							      ((cardinal))
							      ((p.lisp_token_pos_guess is flight)
							       ((digits))
							       ((n.lisp_token_pos_guess is has)
								((cardinal))
								((pp.lisp_token_pos_guess is of)
								 ((cardinal))
								 ((n.lisp_token_pos_guess is and)
								  ((cardinal))
								  ((pp.lisp_token_pos_guess is in)
								   ((cardinal))
								   ((n.lisp_token_pos_guess is was)
								    ((cardinal))
								    ((nn.lisp_token_pos_guess is a)
								     ((cardinal))
								     ((nn.lisp_token_pos_guess is the)
								      ((cardinal))
								      ((n.lisp_token_pos_guess is in)
								       ((cardinal))
								       ((p.lisp_token_pos_guess is and)
									((cardinal))
									((p.lisp_token_pos_guess is for)
									 ((cardinal))
									 ((nn.lisp_token_pos_guess is on)
									  ((cardinal))
									  ((n.lisp_token_pos_guess is or)
									   ((cardinal))
									   ((n.lisp_token_pos_guess is month)
									    ((lisp_month_range is 0)
									     ((cardinal))
									     ((ordinal)))
									    ((p.lisp_token_pos_guess is of)
									     ((cardinal))
									     ((nn.lisp_token_pos_guess is est)
									      ((cardinal))
									      ((lisp_num_digits < 1.2)
									       ((name < 5.4)
										((nn.lisp_token_pos_guess is numeric)
										 ((cardinal))
										 ((name < 0.5)
										  ((cardinal))
										  ((name < 1.4)
										   ((cardinal))
										   ((pp.lisp_token_pos_guess is numeric)
										    ((cardinal))
										    ((p.lisp_token_pos_guess is numeric)
										     ((nn.lisp_token_pos_guess is from)
										      ((digits))
										      ((cardinal)))
										     ((cardinal)))))))
										((cardinal)))
									       ((lisp_month_range is 0)
										((n.lisp_token_zerostart is 0)
										 ((n.lisp_token_pos_guess is numeric)
										  ((digits))
										  ((nn.lisp_token_pos_guess is _other_)
										   ((cardinal))
										   ((pp.lisp_token_pos_guess is numeric)
										    ((cardinal))
										    ((n.lisp_token_pos_guess is _other_)
										     ((name < 95.4)
										      ((cardinal))
										      ((name < 185.4)
										       ((cardinal))
										       ((name < 339.6)
											((cardinal))
											((name < 410.4)
											 ((digits))
											 ((cardinal))))))
										     ((n.lisp_token_pos_guess is sym)
										      ((nn.lisp_token_pos_guess is numeric)
										       ((digits))
										       ((name < 181.6)
											((cardinal))
											((digits))))
										      ((cardinal)))))))
										 ((cardinal)))
										((cardinal)))))))))))))))))))))))))))))))))))))))))))))))))))))
			     ((n.lisp_token_pos_guess is by)
			      ((year))
			      ((n.lisp_token_pos_guess is as)
			       ((year))
			       ((p.lisp_token_pos_guess is by)
				((year))
				((nn.lisp_token_pos_guess is his)
				 ((year))
				 ((p.lisp_token_pos_guess is until)
				  ((year))
				  ((n.lisp_token_pos_guess is when)
				   ((year))
				   ((nn.lisp_token_pos_guess is was)
				    ((year))
				    ((nn.lisp_token_pos_guess is has)
				     ((year))
				     ((nn.lisp_token_pos_guess is he)
				      ((year))
				      ((p.lisp_token_pos_guess is month)
				       ((year))
				       ((n.lisp_token_pos_guess is after)
					((year))
					((nn.lisp_token_pos_guess is to)
					 ((year))
					 ((lisp_token_zerostart is 0)
					  ((p.lisp_token_pos_guess is between)
					   ((year))
					   ((nn.lisp_token_pos_guess is that)
					    ((year))
					    ((p.lisp_token_pos_guess is his)
					     ((year))
					     ((n.lisp_token_pos_guess is in)
					      ((year))
					      ((n.lisp_token_pos_guess is he)
					       ((year))
					       ((pp.lisp_token_pos_guess is month)
						((year))
						((n.lisp_token_pos_guess is with)
						 ((year))
						 ((p.lisp_token_pos_guess is from)
						  ((year))
						  ((n.lisp_token_pos_guess is at)
						   ((year))
						   ((pp.lisp_token_pos_guess is year)
						    ((year))
						    ((p.lisp_token_pos_guess is the)
						     ((year))
						     ((p.lisp_token_zerostart is 0)
						      ((n.lisp_token_pos_guess is it)
						       ((year))
						       ((n.lisp_token_pos_guess is the)
							((year))
							((p.lisp_token_pos_guess is of)
							 ((year))
							 ((p.lisp_token_pos_guess is to)
							  ((year))
							  ((nn.lisp_token_pos_guess is is)
							   ((year))
							   ((p.lisp_token_pos_guess is and)
							    ((year))
							    ((pp.lisp_token_zerostart is 0)
							     ((n.lisp_token_pos_guess is to)
							      ((year))
							      ((n.lisp_token_pos_guess is that)
							       ((year))
							       ((n.lisp_token_pos_guess is for)
								((year))
								((p.lisp_token_pos_guess is a)
								 ((year))
								 ((n.lisp_token_pos_guess is and)
								  ((year))
								  ((nn.lisp_token_pos_guess is the)
								   ((year))
								   ((nn.lisp_token_pos_guess is a)
								    ((year))
								    ((n.lisp_token_pos_guess is a)
								     ((year))
								     ((p.lisp_token_pos_guess is year)
								      ((year))
								      ((nn.lisp_token_pos_guess is and)
								       ((year))
								       ((n.lisp_token_pos_guess is but)
									((year))
									((p.lisp_token_pos_guess is in)
									 ((year))
									 ((n.lisp_token_pos_guess is month)
									  ((cardinal))
									  ((pp.lisp_token_pos_guess is the)
									   ((year))
									   ((p.lisp_token_pos_guess is numeric)
									    ((nn.lisp_token_pos_guess is of)
									     ((digits))
									     ((n.lisp_token_pos_guess is from)
									      ((digits))
									      ((nn.lisp_token_pos_guess is sym)
									       ((digits))
									       ((nn.lisp_token_pos_guess is _other_)
										((digits))
										((year))))))
									    ((n.lisp_token_pos_guess is from)
									     ((year))
									     ((n.lisp_token_pos_guess is numeric)
									      ((digits))
									      ((year)))))))))))))))))))
							     ((year)))))))))
						      ((year))))))))))))))
					  ((digits))))))))))))))))))))))))))))))))))))))))))))
     )
    ;; Leaves are number, letter or century; english_token_to_words
    ;; tests token_pos for "century" and "number" on such tokens.
    ("\\(II?I?\\|IV\\|VI?I?I?\\|IX\\|X[VIX]*\\)" ;; Roman numerals
((p.lisp_tok_rex_names is 0)
 ((lisp_num_digits is 5)
  ((number))
  ((lisp_num_digits is 4)
   ((number))
   ((nn.lisp_num_digits is 13)
    ((number))
    ((p.lisp_num_digits is 7)
     ((number))
     ((p.lisp_tok_section_name is 0)
      ((lisp_tok_rex is 0)
       ((lisp_num_digits is 3)
        ((p.lisp_num_digits is 4)
         ((number))
         ((nn.lisp_num_digits is 4)
          ((number))
          ((n.lisp_num_digits is 4)
           ((number))
           ((pp.lisp_num_digits is 3)
            ((number))
            ((p.lisp_num_digits is 2)
             ((letter))
             ((nn.lisp_num_digits is 2)
              ((letter))
              ((n.cap is 0) ((letter)) ((number)))))))))
        ((nn.lisp_num_digits is 11)
         ((letter))
         ((lisp_num_digits is 1)
          ((pp.lisp_num_digits is 9)
           ((letter))
           ((p.lisp_num_digits is 9)
            ((letter))
            ((n.lisp_num_digits is 6)
             ((letter))
             ((pp.lisp_num_digits is 6)
              ((letter))
              ((pp.cap is 0)
               ((n.cap is 0)
                ((p.lisp_num_digits is 1)
                 ((letter))
                 ((n.lisp_num_digits is 4) ((letter)) ((letter))))
                ((letter)))
               ((letter)))))))
          ((p.lisp_num_digits is 10)
           ((number))
           ((n.lisp_num_digits is 8)
            ((number))
            ((pp.lisp_num_digits is 9)
             ((number))
             ((nn.lisp_num_digits is 5)
              ((number))
              ((n.lisp_num_digits is 4) ((number)) ((letter))))))))))
       ((letter)))
      ((number)))))))
 ((century))))
   ;; Dr/St: leaves are street or title; english_token_to_words says
   ;; "drive"/"street" for street and "doctor"/"saint" otherwise.
   ("\\([dD][Rr]\\|[Ss][tT]\\)"
    ((n.addr is 0)
     ((p.cap is 1)
      ((street))
      ((p.name matches "[0-9]*\\(1[sS][tT]\\|2[nN][dD]\\|3[rR][dD]\\|[0-9][tT][hH]\\)")
       ((street))
       ((title))))
     ((punc matches ".*,.*")
      ((street))
      ((p.punc matches ".*,.*")
       ((title))
       ((n.cap is 0)
	((street))
	((p.cap is 0)
	 ((p.name matches "[0-9]*\\(1[sS][tT]\\|2[nN][dD]\\|3[rR][dD]\\|[0-9][tT][hH]\\)")
	  ((street))
	  ((title)))
	 ((pp.name matches "[1-9][0-9]+")
	  ((street))
	  ((title)))))))))
))

(defvar token_pos_cart_trees
  english_token_pos_cart_trees
  "token_pos_cart_trees
This is a list of pairs of regex plus CART tree.  Tokens that match
the regex will have the CART tree applied, setting the result as
the token_pos feature on the token.  The list is checked in order
and only the first match will be applied.")

(provide 'token)
