//----------------------------------------------------------
// parser:  generalized CYK for stochastic parsing
// Copyright (C) 2002  Rafael C. Carrasco
// Warnings: 
//   parameter entities are replaced and simplified for each model group
//   not checked with namespaces
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.

// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.

// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
//----------------------------------------------------------

using namespace std;

#include <iostream>
#include <fstream>
#include <sstream>
#include <string>
#include <map>
#include <vector>
#include <set>
#ifndef N
#define N 4
#endif

//------------------------------------------------------------------------
template <class T> 
ostream& 
operator<< (ostream& os, const vector<T>& text) {  
  os << '(';
  if ( text.size() > 0 )
    os << text[0];
  for (unsigned k = 1; k < text.size(); ++k) 
    os << ' ' << text[k];
  os << ')';
  return os;
}

//------------------------------------------------------------------------
template <class T> 
ostream& 
operator<< (ostream& os, const set<T>& s) {
  typename set<T>::const_iterator it = s.begin();
  os << '{';
  if ( it != s.end() ) { 
    os << *it;
    while ( ++it != s.end() ) 
      os << ',' << *it;
  }
  os << '}';
  return os;
}

//------------------------------------------------
void 
error (string error_message) { 
  cerr << "Error: " << error_message << endl; 
  exit(1); 
}

//-----------------------------------------------------------------------
// Collection is an indexed set
//-----------------------------------------------------------------------
template <class T> 
class Collection {
  map <const T, uint> index;
  vector <const T*> element;
public:
  size_t  size (void)  const { return element.size(); }
  const T& operator[] (uint n) const { return  *element[n]; }
  uint operator[] (const T& t) {
    pair<typename map<const T, uint>::iterator, bool> r; 

    r = index.insert( pair<const T, uint>(t, index.size()) );
    if ( r.second ) 
      element.push_back( &r.first->first );
    
    return (r.first->second);
  }
};

//----------------------------------------------------------------------
typedef vector<uint> Text;     // text (sequence of words)
typedef set<uint> USet;     // set of unsigned


//-------------------------------------------------------------------
class Grammar {

private:
  Collection<string> lex;          // lexicon  (V + T)
  USet V;                          // grammar variables
  map<Text, USet> R;               // rule set
  map<uint, USet> R1;              // unary productions 
  set<Text> P;                     // rules prefixes
  vector< map<Text, double> > p;   // rule probs
  vector< map<uint, double> > p1;  // 1-prod probs  

  bool is_terminal (uint m)  { return V.find(m) == V.end(); }  
  Text tokenize    (const string&);
  uint add_var     (const string&);
  void add_rule    (const string&);
  void normalize   (void);
  void clousure    (void);   

public:
  double parse  (const string& S, const string& line);

  friend istream& operator>> (istream& is, Grammar& G);
  friend ostream& operator<< (ostream& os, Grammar& G); 
 
};


Text 
Grammar::tokenize (const string& sentence) {
  Text text;
  string word;
  stringstream buff;

  buff << sentence;
  while ( buff >> word ) 
    text.push_back( lex[word] );
  return text;
}


istream& 
operator>> ( istream& is, Grammar& G ) {
  string rule;

  while ( getline(is, rule) )
    G.add_rule(rule); 
  G.normalize(); 
  //  G.clousure();
  return is;
}


ostream& 
operator<< ( ostream& os, Grammar& G ) {
  set<uint>::iterator i;
  map<Text, double>::iterator m;
  map<uint, double>::iterator n;

  for ( i = G.V.begin(); i != G.V.end(); ++i ) {
    for ( m = G.p[*i].begin(); m != G.p[*i].end(); ++m )
      os << *i << ' ' << m->first << ' ' << m->second <<endl;
    for ( n = G.p1[*i].begin(); n != G.p1[*i].end(); ++n )
      os << *i << ' ' << n->first << ' ' << n->second <<endl;
  }

  return os;
}

uint 
Grammar::add_var (const string& var) {
  uint q = lex[var];

  V.insert(q);
  if ( min( p.size(), p1.size() ) < q + 1 ) {
    p.resize(q + 1);
    p1.resize(q + 1);
  }
  
  return q;
}


void 
Grammar::add_rule (const string& rule) {
  stringstream buff;
  string word;
  uint left, right1;
  double n;
  Text right;
 
  buff << rule;
  buff >> n >> word;
  if (!buff) 
    error("wrong syntax in grammar file");

  left = add_var(word);   
 
  buff >> word; 
  if (!buff) 
    error("empty rhs in grammar file");   
  right.push_back( lex[word] );

  while ( buff >> word ) {
    P.insert(right);                   // right is a strict rule prefix
    right.push_back( lex[word] );
  }
  
  if ( right.size() == 1 ) {         // unary
    right1 = lex[word];
    if (left != right1) {
      if ( R1[right1].insert(left).second ) 
	p1[left][right1] = n;
      else
	error(string("rule ") + buff.str() + " twice in grammar"); 
    } else {                        
      cerr<<"Warning: trivial production "
	  <<lex[left]<<' '<<lex[left]<<" removed"<<endl;
    }
  } else {                           // n-ary
    if (  R[right].insert( left ).second ) 
      p[left][right] = n;
    else
      error(string("rule ") + buff.str() + " twice in grammar");
  } 
}


void 
Grammar::normalize (void) {
  uint q;
  double acc;
  set<uint>::iterator i;
  map<Text, double>::iterator m; 
  map<uint, double>::iterator n;

  for ( i = V.begin(); i != V.end(); ++i ) {
    q = *i;
    acc = 0;
    for ( m = p[q].begin(); m != p[q].end(); ++m )
      acc += m->second;
    for ( n = p1[q].begin(); n != p1[q].end(); ++n )
      acc += n->second;
    for ( m = p[q].begin(); m != p[q].end(); ++m )
      m->second /= acc;
    for ( n = p1[q].begin(); n != p1[q].end(); ++n )
      n->second /= acc;
  }
}

void 
Grammar::clousure (void) {
  set<uint> added[2];              // added vars
  uint right, left, numit = 0;     
  map<uint, USet>::iterator it;    // iterators
  USet::iterator jt, kt;
 
  for ( it = R1.begin(); it != R1.end(); ++it ) {
    added[0].clear();                                           
    for ( kt = it->second.begin(); kt != it->second.end(); ++it ) 
      added[0].insert(*kt);
 
    right = it->first;    
    while ( added[numit%2].size() > 0 ) {
      added[1-numit%2].clear();
      for ( kt = added[numit%2].begin(); kt != added[numit%2].end(); ++kt ) {
        left = *kt;
        if ( R1.find(left) != R1.end() ) { 
         for ( jt = R1[left].begin(); jt != R1[left].end(); ++jt ) {
           if ( *jt == right ) { 
	     cerr << "Warning: " << lex[right]
		  <<" self-derivation" << endl;
	   } else if (  p1[*jt][left] * p1[left][right] > p1[*jt][right] ) { 
	     R1[right].insert(*jt); 
	     added[1-numit%2].insert(*jt);
	     p1[*jt][right] = p1[*jt][left] * p1[left][right];
	   }
	 }
	}
      }
      ++numit;
    }
  }  
}

struct Result {
  double prob;
  string tree;
  Result (void) : prob(0) {;}
  Result (double _prob, string _tree) : prob(_prob), tree(_tree) {;}
};


double 
Grammar::parse (const string& S, const string& line) {
  Text const sentence = tokenize(line);
  uint const LEN = 75, len = sentence.size(), start = lex[S];
  uint  i, j, k;  // aux indices
  Text text;
  double prtext;
  map<uint, Result> chart1[LEN][LEN];       // chart1: lhs 
  map<Text, Result> chart2[LEN][LEN];       // chart2: rhs strict prefixes
  map<uint, Result>::iterator it;           // iterators
  map<Text, Result>::iterator jt;
  USet::iterator kt;
  
  if ( len > LEN ) 
    error(string("recompile with larger LEN"));

  // chart1 initialization 
  for (j = 0; j < len; ++j ) {             
    chart1[0][j][sentence[j]] = Result(1, lex[sentence[j]]);  // terminal
    if ( R1.find(sentence[j]) != R1.end() ) {   // unit-productions
      for ( kt = R1[sentence[j]].begin(); kt != R1[sentence[j]].end(); ++kt ) {
        chart1[0][j][*kt] = Result(p1[*kt][sentence[j]], '(' + 
				   lex[*kt] + ' ' + lex[sentence[j]] + ')');
      }
    }
  }

  // build charts
  for (i = 0; i < len; ++i) { 
    for (j = 0; j < len-i; ++j) {
      for (k = 0; k < i; ++k) {  
	for (jt = chart2[k][j].begin(); jt != chart2[k][j].end(); ++jt ) {  
          text = jt->first; 
	  for ( it = chart1[i-k-1][j+k+1].begin(); it != chart1[i-k-1][j+k+1].end(); ++it ) {
            text.push_back(it->first); 
	    // warning: new text may be both a strict prefix and a rhs
	    prtext = jt->second.prob * it->second.prob;
            if ( P.find(text) != P.end() && prtext > chart2[i][j][text].prob ) 
	      chart2[i][j][text] = Result(prtext, jt->second.tree + ' ' +
					  it->second.tree);  
	    
	    if ( R.find(text) != R.end() ) {  
	      USet* uset = & R.find(text)->second;
              for ( kt = uset->begin(); kt != uset->end(); ++kt ) {
		//		 prtmp = p[*kt][text] * jt->second.prob * it->second.prob;
		 if ( p[*kt][text] * prtext > chart1[i][j][*kt].prob ) { 
		   chart1[i][j][*kt] = Result(p[*kt][text] * prtext, '(' + lex[*kt] + ' ' +
					      jt->second.tree + ' ' + 
					      it->second.tree + ')');
		 }	 
	      }
	    }
            text.pop_back();
	  }
	}
      }
  
      // unit-production chains
      for (uint n = 0; n < N; ++n) {
      for (it = chart1[i][j].begin(); it != chart1[i][j].end(); ++it ) { 
	if ( R1.find(it->first) != R1.end() ) {   
	  for ( kt = R1[it->first].begin(); kt != R1[it->first].end(); ++kt ) {
	    prtext = p1[*kt][it->first] * it->second.prob;
	    if ( prtext > chart1[i][j][*kt].prob ) {
	      chart1[i][j][*kt] = Result(prtext, '(' + lex[*kt] + ' ' + 
					 chart1[i][j][it->first].tree + ')');
	    }
	  }
	}	
      }
      }
      // update chart2
      for ( it =  chart1[i][j].begin(); it != chart1[i][j].end(); ++it ) { 
        text = Text(1, it->first);
	if ( P.find(text) != P.end() ) {
	  if ( chart1[i][j][it->first].prob > chart2[i][j][text].prob ) {
	    chart2[i][j][text] = Result( it->second.prob, chart1[i][j][it->first].tree ); 
	  } 
	} 
      }
    }
  }
 
  Result res = chart1[len-1][0][start];
  cout << res.tree << endl;
 
  return res.prob;
} 


//---------------------------------------------------------------
int 
main( int argc, char** argv ) {
 ifstream infile;
 Grammar G;
 string S = "S1", line;

 if ( argc < 2) {
   cerr << "usage: parser grammar_file [-S start] < sentence "
	<< endl;
   exit(1);
 }
 for ( int k = 1; k < argc; ++k ) {
   if (string(argv[k]) == "-S") {
     if ( ++k == argc ) 
       error("missing argument");
     else
       S = argv[k]; 
   } else {
     infile.open(argv[k]);
     if ( !infile ) 
       error(string("cannot open") + argv[k]);
   }
 }

 infile >> G;   

 while ( getline(cin, line) ) 
   cerr << G.parse(S, line) << endl;
 
 return 0;
}


