40 #include "EST_Ngrammar.h"
43 int main(
int argc,
char **argv)
48 EST_String wordlist_file, script_file, in_file, format;
56 bool per_file_stats=
false;
61 double raw_entropy,count,entropy,perplexity,total_raw_H,total_count;
67 EST_String(
"[input file0] [input file1] ...\n")+
68 "-g <ifile> grammar file (required)\n"+
69 "-w <ifile> filename containing word list (required for some grammar formats)\n"+
70 "-S <ifile> script file\n"+
71 "-raw_stats print unnormalised entropy and sample count\n"+
72 "-brief print results in brief format\n"+
73 "-f print stats for each file\n"+
75 "-input_format <string>\n"+
76 " format of input data (default sentence_per_line)\n"+
77 " may also be sentence_per_file, or ngram_per_line.\n"+
80 "-prev_tag <string>\n"+
81 " tag before sentence start\n"+
82 "-prev_prev_tag <string>\n"+
83 " all words before 'prev_tag'\n"+
84 "-last_tag <string>\n"+
85 " after sentence end\n"+
87 " use default tags of "+SENTENCE_START_MARKER+
88 ","+SENTENCE_END_MARKER+
" and "+SENTENCE_END_MARKER+
"\n"+
94 wordlist_file = al.
val(
"-w");
100 per_file_stats =
true;
101 if (al.
present(
"-input_format"))
102 input_format = al.
val(
"-input_format");
104 input_format =
"sentence_per_line";
113 if (al.
present(
"-default_tags"))
115 prev_tag = SENTENCE_START_MARKER;
116 prev_prev_tag = SENTENCE_END_MARKER;
117 last_tag = SENTENCE_END_MARKER;
122 if (al.
present(
"-default_tags"))
123 cerr <<
"test_ngram: WARNING : -prev_tag overrides -default_tags"
125 prev_tag = al.
val(
"-prev_tag");
128 if (al.
present(
"-prev_prev_tag"))
130 if (al.
present(
"-default_tags"))
131 cerr <<
"test_ngram: WARNING : -prev_prev_tag overrides -default_tags" << endl;
132 prev_prev_tag = al.
val(
"-prev_prev_tag");
137 if (al.
present(
"-default_tags"))
138 cerr <<
"test_ngram: WARNING : -last_tag overrides -default_tags" << endl;
139 last_tag = al.
val(
"-last_tag");
142 if ( ( (prev_tag==
"") || (prev_prev_tag==
"") || (last_tag==
"") )
143 && ( (prev_tag!=
"") || (prev_prev_tag!=
"") || (last_tag!=
"") ) )
145 cerr <<
"test_ngram: ERROR : if any tags are given, ALL must be given" << endl;
153 script_file = al.
val(
"-S");
157 cerr <<
"test_ngram: Could not read script from file "
158 << script_file << endl;
164 in_file = al.
val(
"-g");
167 cerr <<
"test_ngram: Must give a grammar filename using -g" << endl;
173 if(script.head()==NULL)
176 for(p=files.head();p!=0;p=p->next())
180 if(script.head() == NULL)
182 cerr <<
"test_ngram: No test files given" << endl;
186 if (wordlist_file !=
"")
191 cerr <<
"test_ngram: Could not read wordlist from file " << wordlist_file
197 if (ngrammar.load(in_file,wordlist) != format_ok)
199 cerr <<
"test_ngram: Failed to load grammar" << endl;
205 if (ngrammar.load(in_file) != format_ok)
207 cerr <<
"test_ngram: Failed to load grammar" << endl;
214 cout <<
"Ngram Test Results" << endl;
215 cout <<
"==================" << endl;
218 for (p = script.head(); p; p = p->next())
221 if (test_stats(ngrammar,
229 total_raw_H += raw_entropy;
230 total_count += count;
235 cout <<
basename(script(p)) <<
" \t";
237 cout << script(p) << endl;
242 cout << raw_entropy <<
" " << count <<
" ";
245 cout <<
" raw entropy " << raw_entropy << endl;
246 cout <<
" count " << count << endl;
251 cout << entropy <<
" " << perplexity << endl;
254 cout <<
" entropy " << entropy << endl;
255 cout <<
" perplexity " << perplexity << endl << endl;
261 cerr <<
"test_ngram: WARNING : file '" << script(p)
262 <<
"' could not be processed" << endl;
269 cout <<
"Summary for grammar " << in_file << endl;
272 cout <<
"summary \t";
277 cout << total_raw_H <<
" " << total_count <<
" ";
280 cout <<
" raw entropy " << total_raw_H << endl;
281 cout <<
" count " << total_count << endl;
286 cout << total_raw_H / total_count;
287 cout <<
" " << pow(2.0,total_raw_H / total_count);
292 cout <<
" entropy " << total_raw_H / total_count << endl;
293 cout <<
" perplexity " << pow(2.0,total_raw_H / total_count);
299 cerr <<
"test_ngram: No data processed" << endl;
const int present(const K &rkey) const
Returns true if key is present.
const V & val(const K &rkey, bool m=0) const
Returns the value associated with the given key (const version).
EST_String basename(EST_String full, EST_String ext="")
This acts like the Bourne shell basename command. By default, it strips any leading path from a string...
void append(const T &item)
Adds an item onto the end of the list.
EST_read_status load_StrList(EST_String filename, EST_StrList &l)
Load tokens from a file and return them in a EST_StrList.