Edinburgh Speech Tools  2.1-release
 All Classes Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
ngram_test_main.cc
1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1995,1996 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Authors: Simon King */
34 /* Date : July 1995 */
35 /*-----------------------------------------------------------------------*/
36 /* EST_Ngrammar test program */
37 /* */
38 /*=======================================================================*/
39 #include "EST.h"
40 #include "EST_Ngrammar.h"
41 
42 
43 int main(int argc, char **argv)
44 {
45  //int order;
46  EST_StrList files,script;
47  EST_Option al, op;
48  EST_String wordlist_file, script_file, in_file, format;
49  EST_String prev_tag, prev_prev_tag, last_tag;
50  EST_Litem *p;
51  //EST_Ngrammar::representation_t representation =
52  //EST_Ngrammar::dense;
53 
54  EST_StrList wordlist;
55  EST_Ngrammar ngrammar;
56  bool per_file_stats=false;
57  bool raw_stats=false;
58  bool brief=false;
59  EST_String input_format;
60 
61  double raw_entropy,count,entropy,perplexity,total_raw_H,total_count;
62  total_count = 0;
63  total_raw_H = 0;
64 
65  parse_command_line
66  (argc, argv,
67  EST_String("[input file0] [input file1] ...\n")+
68  "-g <ifile> grammar file (required)\n"+
69  "-w <ifile> filename containing word list (required for some grammar formats)\n"+
70  "-S <ifile> script file\n"+
71  "-raw_stats print unnormalised entropy and sample count\n"+
72  "-brief print results in brief format\n"+
73  "-f print stats for each file\n"+
74  "\n"+
75  "-input_format <string>\n"+
76  " format of input data (default sentence_per_line)\n"+
77  " may also be sentence_per_file, or ngram_per_line.\n"+
78  "\n"+
79  "Pseudo-words :\n"+
80  "-prev_tag <string>\n"+
81  " tag before sentence start\n"+
82  "-prev_prev_tag <string>\n"+
83  " all words before 'prev_tag'\n"+
84  "-last_tag <string>\n"+
85  " after sentence end\n"+
86  "-default_tags\n"+
87  " use default tags of "+SENTENCE_START_MARKER+
88  ","+SENTENCE_END_MARKER+" and "+SENTENCE_END_MARKER+"\n"+
89  " respectively\n",
90  files, al);
91 
92 
93  if (al.present("-w"))
94  wordlist_file = al.val("-w");
95  else{
96  wordlist_file = "";
97  }
98 
99  if (al.present("-f"))
100  per_file_stats = true;
101  if (al.present("-input_format"))
102  input_format = al.val("-input_format");
103  else
104  input_format = "sentence_per_line";
105 
106  if ( al.present("-raw_stats") || al.present("-r"))
107  raw_stats = true;
108 
109  if ( al.present("-brief") || al.present("-b") )
110  brief = true;
111 
112 
113  if (al.present("-default_tags"))
114  {
115  prev_tag = SENTENCE_START_MARKER;
116  prev_prev_tag = SENTENCE_END_MARKER;
117  last_tag = SENTENCE_END_MARKER;
118  }
119 
120  if (al.present("-prev_tag"))
121  {
122  if (al.present("-default_tags"))
123  cerr << "test_ngram: WARNING : -prev_tag overrides -default_tags"
124  << endl;
125  prev_tag = al.val("-prev_tag");
126  }
127 
128  if (al.present("-prev_prev_tag"))
129  {
130  if (al.present("-default_tags"))
131  cerr << "test_ngram: WARNING : -prev_prev_tag overrides -default_tags" << endl;
132  prev_prev_tag = al.val("-prev_prev_tag");
133  }
134 
135  if (al.present("-last_tag"))
136  {
137  if (al.present("-default_tags"))
138  cerr << "test_ngram: WARNING : -last_tag overrides -default_tags" << endl;
139  last_tag = al.val("-last_tag");
140  }
141 
142  if ( ( (prev_tag=="") || (prev_prev_tag=="") || (last_tag=="") )
143  && ( (prev_tag!="") || (prev_prev_tag!="") || (last_tag!="") ) )
144  {
145  cerr << "test_ngram: ERROR : if any tags are given, ALL must be given" << endl;
146  exit(1);
147  }
148 
149 
150  // script
151  if (al.present("-S"))
152  {
153  script_file = al.val("-S");
154 
155  if(load_StrList(script_file,script) != format_ok)
156  {
157  cerr << "test_ngram: Could not read script from file "
158  << script_file << endl;
159  exit(1);
160  }
161  }
162 
163  if (al.present("-g"))
164  in_file = al.val("-g");
165  else
166  {
167  cerr << "test_ngram: Must give a grammar filename using -g" << endl;
168  exit(1);
169  }
170 
171  // plus any files on command line
172  // except file "-" unless there is no script
173  if(script.head()==NULL)
174  script += files;
175  else
176  for(p=files.head();p!=0;p=p->next())
177  if(files(p) != "-")
178  script.append(files(p));
179 
180  if(script.head() == NULL)
181  {
182  cerr << "test_ngram: No test files given" << endl;
183  exit(1);
184  }
185 
186  if (wordlist_file != "")
187  {
188  // load wordlist
189  if (load_StrList(wordlist_file,wordlist) != format_ok)
190  {
191  cerr << "test_ngram: Could not read wordlist from file " << wordlist_file
192  << endl;
193  exit(1);
194  }
195 
196  // load grammar using wordlist
197  if (ngrammar.load(in_file,wordlist) != format_ok)
198  {
199  cerr << "test_ngram: Failed to load grammar" << endl;
200  exit(1);
201  }
202  }
203  else
204  {
205  if (ngrammar.load(in_file) != format_ok)
206  {
207  cerr << "test_ngram: Failed to load grammar" << endl;
208  exit(1);
209  }
210  }
211 
212  if (!brief)
213  {
214  cout << "Ngram Test Results" << endl;
215  cout << "==================" << endl;
216  }
217 
218  for (p = script.head(); p; p = p->next())
219  {
220  // test each file
221  if (test_stats(ngrammar,
222  script(p),
223  raw_entropy,count,
224  entropy,perplexity,
225  input_format,
226  prev_tag,
227  prev_prev_tag))
228  {
229  total_raw_H += raw_entropy;
230  total_count += count;
231 
232  if(per_file_stats)
233  {
234  if (brief)
235  cout << basename(script(p)) << " \t";
236  else
237  cout << script(p) << endl;
238 
239  if(raw_stats)
240  {
241  if (brief)
242  cout << raw_entropy << " " << count << " ";
243  else
244  {
245  cout << " raw entropy " << raw_entropy << endl;
246  cout << " count " << count << endl;
247  }
248  }
249 
250  if (brief)
251  cout << entropy << " " << perplexity << endl;
252  else
253  {
254  cout << " entropy " << entropy << endl;
255  cout << " perplexity " << perplexity << endl << endl;
256  }
257  }
258  }
259  else
260  {
261  cerr << "test_ngram: WARNING : file '" << script(p)
262  << "' could not be processed" << endl;
263  }
264 
265  }
266  if (total_count > 0)
267  {
268  if (!brief)
269  cout << "Summary for grammar " << in_file << endl;
270  else
271  if (per_file_stats)
272  cout << "summary \t";
273 
274  if(raw_stats)
275  {
276  if (brief)
277  cout << total_raw_H << " " << total_count << " ";
278  else
279  {
280  cout << " raw entropy " << total_raw_H << endl;
281  cout << " count " << total_count << endl;
282  }
283  }
284  if (brief)
285  {
286  cout << total_raw_H / total_count;
287  cout << " " << pow(2.0,total_raw_H / total_count);
288  cout << endl;
289  }
290  else
291  {
292  cout << " entropy " << total_raw_H / total_count << endl;
293  cout << " perplexity " << pow(2.0,total_raw_H / total_count);
294  cout << endl;
295  }
296  }
297  else
298  {
299  cerr << "test_ngram: No data processed" << endl;
300  }
301 
302  // everything went okay
303  return 0;
304 }
305 
306 
307 void override_lib_ops(EST_Option &a_list, EST_Option &al)
308 {
309  (void)a_list;
310  (void)al;
311 }
312 
const int present(const K &rkey) const
Returns true if key is present.
Definition: EST_TKVL.cc:222
const V & val(const K &rkey, bool m=0) const
return value according to key (const)
Definition: EST_TKVL.cc:145
EST_String basename(EST_String full, EST_String ext="")
This acts like the bourne shell basename command. By default, it strips any leading path from a strin...
Definition: util_io.cc:169
void append(const T &item)
add item onto end of list
Definition: EST_TList.h:198
EST_read_status load_StrList(EST_String filename, EST_StrList &l)
Load tokens from a file and return them in an EST_StrList.