Edinburgh Speech Tools  2.1-release
 All Classes Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
ngram_build_main.cc
1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1995,1996 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Authors: Simon King */
34 /* Date : July 1995 */
35 /*-----------------------------------------------------------------------*/
36 /* EST_Ngrammar build program */
37 /* */
38 /*=======================================================================*/
39 
40 #include <cstdlib>
41 #include "EST.h"
42 #include "EST_Ngrammar.h"
43 #include "EST_Pathname.h"
44 
45 
46 int main(int argc, char **argv)
47 {
48  int order;
49  EST_StrList files;
50  EST_Option al, op;
51  EST_String wordlist_file,wordlist_file2, out_file, format;
52  EST_String prev_tag(""), prev_prev_tag(""), last_tag("");
53  EST_String input_format(""), oov_mode(""), oov_marker("");
54  EST_Ngrammar::representation_t representation =
55  EST_Ngrammar::dense;
56 
57  EST_StrList wordlist,wordlist2;
58  EST_Ngrammar ngrammar;
59  bool trace=false;
60  double floor=0.0;
61 
62  parse_command_line
63  (argc, argv,
64  EST_String("[input file0] [input file1] ... -o [output file]\n")+
65  "-w <ifile> filename containing word list (required)\n"+
66  "-p <ifile> filename containing predictee word list\n"+
67  " (default is to use wordlist given by -w)\n"+
68  "-order <int> order, 1=unigram, 2=bigram etc. (default 2)\n"+
69  "-smooth <int> Good-Turing smooth the grammar up to the\n"+
70  " given frequency\n"+
71  "-o <ofile> Output file for constructed ngram\n"+
72  "\n"
73  "-input_format <string>\n"+
74  " format of input data (default sentence_per_line)\n"+
75  " may be sentence_per_file, ngram_per_line.\n"+
76  "-otype <string> format of output file, one of cstr_ascii\n"+
77  " cstr_bin or htk_ascii\n"+
78  "-sparse build ngram in sparse representation\n"+
79  "-dense build ngram in dense representation (default)\n"+
80  "-backoff <int>\n"+
81  " build backoff ngram (requires -smooth)\n"+
82  "-floor <double>\n"+
83  " frequency floor value used with some ngrams\n"+
84  "-freqsmooth <int>\n"+
85  " build frequency backed off smoothed ngram, this\n"+
86  " requires -smooth option\n"+
87  "-trace give verbose outout about build process\n"+
88  "-save_compressed save ngram in gzipped format\n"+
89  "-oov_mode <string>\n"+
90  " what to do about out-of-vocabulary words,\n"+
91  " one of skip_ngram, skip_sentence (default),\n"+
92  " skip_file, or use_oov_marker\n"+
93  "-oov_marker <string>\n"+
94  " special word for oov words (default "+OOV_MARKER+")\n"+
95  " (use in conjunction with '-oov_mode use_oov_marker'\n"+
96  "\n"+
97  "Pseudo-words :\n"+
98  "-prev_tag <string>\n"+
99  " tag before sentence start\n"+
100  "-prev_prev_tag <string>\n"+
101  " all words before 'prev_tag'\n"+
102  "-last_tag <string>\n"+
103  " after sentence end\n"+
104  "-default_tags use default tags of "+SENTENCE_START_MARKER+
105  ","+SENTENCE_END_MARKER+" and "+SENTENCE_END_MARKER+"\n"+
106  " respectively\n",
107  files, al);
108 
109  if (al.present("-input_format"))
110  input_format = al.val("-input_format");
111  else
112  input_format = "sentence_per_line";
113 
114  if (al.present("-oov_mode"))
115  oov_mode = al.val("-oov_mode");
116  else
117  oov_mode = "skip_sentence";
118 
119 
120  if(al.present("-oov_marker"))
121  {
122  if(oov_mode != "use_oov_marker")
123  {
124  cerr << "Error : can only use -oov_marker with '-oov_mode use_oov_marker'" << endl;
125  exit(1);
126  }
127  else
128  oov_marker = al.val("-oov_marker");
129 
130  // should check oov marker is/isn't (?) in vocab
131  // ......
132  }
133 
134  if( (oov_mode != "skip_ngram") &&
135  (oov_mode != "skip_sentence") &&
136  (oov_mode != "skip_file") &&
137  (oov_mode != "use_oov_marker") )
138  {
139  cerr << oov_mode << " is not a valid oov_mode !" << endl;
140  exit(1);
141  }
142 
143  if (al.present("-w"))
144  wordlist_file = al.val("-w");
145  else{
146  cerr << "build_ngram: Must specify a wordlist with -w" << endl;
147  exit(1);
148  }
149 
150  if (load_StrList(wordlist_file,wordlist) != format_ok)
151  {
152  cerr << "build_ngram: Could not read wordlist from file "
153  << wordlist_file << endl;
154  exit(1);
155  }
156 
157 
158  if (al.present("-p"))
159  {
160 
161  if(input_format != "ngram_per_line")
162  {
163  cerr << "Can't have differering predictor/predictee lists unless data is in ngram_per_line format !" << endl;
164  exit(1);
165  }
166 
167  wordlist_file2 = al.val("-p");
168  if (load_StrList(wordlist_file2,wordlist2) != format_ok)
169  {
170  cerr << "build_ngram: Could not read predictee list from file "
171  << wordlist_file2 << endl;
172  exit(1);
173  }
174  }
175 
176  if (al.present("-trace"))
177  trace=true;
178 
179  if (al.present("-o"))
180  out_file = al.val("-o");
181  else
182  out_file = "-";
183 
184  if (al.present("-default_tags"))
185  {
186  prev_tag = SENTENCE_START_MARKER;
187  prev_prev_tag = SENTENCE_END_MARKER;
188  last_tag = SENTENCE_END_MARKER;
189 
190  wordlist.append(SENTENCE_START_MARKER);
191  wordlist.append(SENTENCE_END_MARKER);
192 
193  if (al.present("-p"))
194  {
195  wordlist2.append(SENTENCE_START_MARKER);
196  wordlist2.append(SENTENCE_END_MARKER);
197  }
198  }
199 
200  if (al.present("-prev_tag"))
201  {
202  if (al.present("-default_tags"))
203  cerr << "build_ngram: WARNING : -prev_tag overrides -default_tags"
204  << endl;
205  prev_tag = al.val("-prev_tag");
206  }
207 
208  if (al.present("-prev_prev_tag"))
209  {
210  if (al.present("-default_tags"))
211  cerr << "build_ngram: WARNING : -prev_prev_tag overrides -default_tags"
212  << endl;
213  prev_prev_tag = al.val("-prev_prev_tag");
214  }
215 
216  if (al.present("-last_tag"))
217  {
218  if (al.present("-default_tags"))
219  cerr << "build_ngram: WARNING : -last_tag overrides -default_tags"
220  << endl;
221  last_tag = al.val("-last_tag");
222  }
223 
224  if ( ( (prev_tag=="") || (prev_prev_tag=="") || (last_tag=="") )
225  && ( (prev_tag!="") || (prev_prev_tag!="") || (last_tag!="") ) )
226  {
227  cerr << "build_ngram: ERROR : if any tags are given, ALL must be given"
228  << endl;
229  exit(1);
230  }
231 
232  if (al.present("-order"))
233  order = al.ival("-order");
234  else
235  {
236  cerr << "build_ngram: WARNING : No order specified with -order : defaulting to bigram"
237  << endl;
238  order = 2;
239  }
240 
241  if (al.present("-otype"))
242  format = al.val("-otype");
243  else
244  format = "";
245 
246  if (al.present("-floor"))
247  floor = al.dval("-floor");
248  else
249  floor = 0.0;
250 
251  if (al.present("-backoff"))
252  if (!al.present("-smooth"))
253  {
254  cerr << "build_ngram: backoff requires smooth value" << endl;
255  exit(-1);
256  }
257  if (al.present("-freqsmooth"))
258  if (!al.present("-smooth"))
259  {
260  cerr << "build_ngram: frequency smooth requires smooth value"
261  << endl;
262  exit(-1);
263  }
264 
265  if (al.present("-dense"))
266  representation = EST_Ngrammar::dense;
267  else if (al.present("-sparse"))
268  {
269  cerr << "build_ngram: Sorry, sparse representation is not yet available " << endl;
270  exit(1);
271  representation = EST_Ngrammar::sparse;
272  }
273  else if (al.present("-backoff"))
274  representation = EST_Ngrammar::backoff;
275  else
276  cerr << "build_ngram: Defaulting to dense representation" << endl;
277 
278  if (al.present("-p"))
279  {
280  if (!ngrammar.init(order,representation,wordlist,wordlist2))
281  {
282  cerr << "build_ngram: Failed to initialise " << order << "-gram" << endl;
283  exit(1);
284  }
285  }
286  else
287  {
288  if (!ngrammar.init(order,representation,wordlist))
289  {
290  cerr << "build_ngram: Failed to initialise " << order << "-gram" << endl;
291  exit(1);
292  }
293  }
294 
295 
296  if ( al.present("-backoff") )
297  {
298  if (!ngrammar.build(files,prev_tag,prev_prev_tag,
299  last_tag,input_format,oov_mode,
300  al.ival("-backoff"),al.ival("-smooth")))
301  {
302  cerr << "build_ngram: Failed to build backoff " << order
303  << "-gram" << endl;
304  exit(1);
305  }
306  else if (trace)
307  cerr << "build_ngram: Built backoff " << order <<
308  "-gram" << endl;
309  }
310  else
311  {
312  if (!ngrammar.build(files,prev_tag,prev_prev_tag,
313  last_tag,input_format,oov_mode))
314  {
315  cerr << "build_ngram: Failed to build " << order << "-gram" << endl;
316  exit(1);
317  }
318  else
319  if(trace)
320  cerr << "build_ngram: Built " << order << "-gram" << endl;
321  }
322 
323 
324  // Posit processing functions
325  if (al.present("-freqsmooth"))
326  {
327  Ngram_freqsmooth(ngrammar,al.ival("-smooth"),al.ival("-freqsmooth"));
328  }
329  else if (al.present("-smooth") && !al.present("-backoff"))
330  {
331  int smoothcount = atoi(al.val("-smooth"));
332  if(!Good_Turing_smooth(ngrammar,smoothcount))
333  {
334  cerr << "build_ngram: Failed to smooth " << order << "-gram" << endl;
335  exit(1);
336  }
337  else
338  if(trace)
339  cerr << "build_ngram: Good Turing smoothed " << order << "-gram" << endl;
340 
341  }
342 
343  // save
344  if (al.present("-save_compressed"))
345  {
346  EST_String tmp_file = make_tmp_filename();
347  if (ngrammar.save(tmp_file,format,trace,floor) == write_ok)
348  {
349  EST_String prog_name;
350  EST_Pathname tmp(out_file);
351  if (tmp.extension() == GZIP_FILENAME_EXTENSION)
352  prog_name = "gzip --stdout";
353  else if (tmp.extension() == COMPRESS_FILENAME_EXTENSION)
354  prog_name = "compress -c";
355  else // default
356  {
357  prog_name = "gzip --stdout";
358  if(out_file != "-")
359  out_file = out_file + "." + GZIP_FILENAME_EXTENSION;
360  }
361 
362  if (trace)
363  cerr << "build_ngram: Compressing with '" << prog_name << "'" << endl;
364 
365  // now compress
366  if(compress_file(tmp_file,out_file,prog_name) != 0)
367  {
368  cerr << "build_ngram: Failed to compress to file "
369  << out_file << endl;
370  (void)delete_file(tmp_file);
371  exit(1);
372  }
373 
374  (void)delete_file(tmp_file);
375 
376  if(trace)
377  cerr << "build_ngram: Saved in compressed " << format
378  << " format to " << out_file << endl;
379  }
380  else
381  {
382  cerr << "build_ngram: Failed to write temporary file "
383  << tmp_file << endl;
384  exit(1);
385  }
386 
387 
388  }
389  else
390  {
391  if (ngrammar.save(out_file,format,trace,floor) == write_ok)
392  {
393  if(trace)
394  cerr << "build_ngram: Saved in " << format
395  << " format to " << out_file << endl;
396  }
397  else
398  {
399  cerr << "build_ngram: Failed to save " << format << " format data to "
400  << out_file << endl;
401  exit(1);
402  }
403  }
404 
405 
406  // everything went okay
407  return 0;
408 }
int ival(const EST_String &rkey, int m=1) const
Definition: EST_Option.cc:76
double dval(const EST_String &rkey, int m=1) const
Definition: EST_Option.cc:109
EST_String make_tmp_filename()
Make a unique temporary filename.
Definition: util_io.cc:54
const int present(const K &rkey) const
Returns true if key is present.
Definition: EST_TKVL.cc:222
const V & val(const K &rkey, bool m=0) const
return value according to key (const)
Definition: EST_TKVL.cc:145
void append(const T &item)
add item onto end of list
Definition: EST_TList.h:198
int delete_file(const EST_String &filename)
OS independent way of removing a file.
Definition: EST_io_aux.h:81
int compress_file(const EST_String &filename, const EST_String &new_filename, const EST_String &prog_name)
compress file by calling program prog, writing result to new_filename
Definition: util_io.cc:231
EST_read_status load_StrList(EST_String filename, EST_StrList &l)
Load tokens from a file and return them in a EST_StrList.