/***************************************************************************
                          analyze.cpp  -  description
                             -------------------
    begin                : Sun Jan 7 2001
    copyright            : (C) 2001 by Jan Mueller
    email                : janmueller7@hotmail.com
 ***************************************************************************/

/***************************************************************************
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU General Public License as published by  *
 *   the Free Software Foundation; either version 2 of the License, or     *
 *   (at your option) any later version.                                   *
 *                                                                         *
 ***************************************************************************/
/***************************************************************************
                          analyze.cpp  -  description                              
                             -------------------                                         
    begin                : Thu Jun 15 2000                                           
    copyright            : (C) 2000 by Jan Mueller                         
    email                : janmueller7@hotmail.com
 ***************************************************************************/

/***************************************************************************
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU General Public License as published by  *
 *   the Free Software Foundation; either version 2 of the License, or     *
 *   (at your option) any later version.                                   * 
 *                                                                         *
 ***************************************************************************/

#include <strstream.h>
#include <stdlib.h>
#include <qstring.h>
#include <FlexLexer.h>
#include <qstrlist.h>
#include "analyze.h"


void Analyze::addSuffix(const char *s) {
	masks.append( new QRegExp(s, true, true) );
}

/*
 * Tells whether a file name is covered by one of the registered masks.
 *
 * A mask counts only when it matches at position 0 and the matched length
 * equals the length of the whole name, i.e. the complete name is matched.
 * Returns false when no mask qualifies (including when none is registered).
 */
bool Analyze::fileMatches(const char *s) {
	unsigned int idx=0;
	while(idx<masks.count()) {
		int matched=0;
		const int pos=masks.at(idx)->match(s, 0, &matched);
		++idx;
		if(pos!=0)
			continue;
		if(matched==static_cast<int>(strlen(s)))
			return true;
	}
	return false;
}

/*
 * Sets up the analyzer against the "kaspaliste" database with rebuild
 * switched off and no frequency cut-off (max == 0).
 *
 * All three containers own their elements: auto-delete is enabled so the
 * heap-allocated records and masks are destroyed together with them.
 */
Analyze::Analyze()
	: Sql("dbname=kaspaliste"),
	  rebuild(false),
	  max(0)
{
	// NOTE(review): 49999 is presumably the hash-table size of the token dictionary - confirm.
	tokens.resize(49999);
	tokens.setAutoDelete(true);
	oids.setAutoDelete(true);
	masks.setAutoDelete(true);
}


/*
 * Intentionally empty: the constructor enables auto-delete on tokens, oids
 * and masks, so no explicit cleanup is written here.
 */
Analyze::~Analyze()
{
}

void Analyze::getOIDs(int type, QString rel, QString field, QList<oidsrec> *l) {
	if(table("getoidtmp"))
		exec("DROP table getoidtmp");

	exec("BEGIN");
	QString s("select oid as o, ");
	s+=field+" as p into table getoidtmp from "; s+=rel+" where "; s+=field+" != 0";
	exec(s);
	s="select o,p from getoidtmp where p not in (select obj from analyzed)";
	exec(s);
	for(int i=0; i<tuples(); i++) {
		oidsrec *t_oid=new oidsrec;
    t_oid->obj=QString(getValue(i, "p")).toUInt();
    t_oid->table=QString(getValue(i, "o")).toUInt();
		t_oid->type=type;
		l->append(t_oid);
	}
	exec("DROP table getoidtmp");
	exec("END");
}

void Analyze::getFileOIDs(int type, QList<oidsrec> *l) {
	if(table("getoidtmp"))
		exec("DROP table getoidtmp");

	exec("BEGIN");
	QString s("select oid as o, file as p, filename as f into table getoidtmp from partdata where file != 0");

	exec(s);
	s="select o, p, f from getoidtmp where p not in (select obj from analyzed)";
	exec(s);
	for(int i=0; i<tuples(); i++) {
		if(fileMatches(getValue(i, "f"))) {
			oidsrec *t_oid=new oidsrec;
  	  t_oid->obj=QString(getValue(i, "p")).toUInt();
    	t_oid->table=QString(getValue(i, "o")).toUInt();
			t_oid->type=type;
			l->append(t_oid);
		} else {
			getValue(i, "p");
			getValue(i, "o");
		}
	}
	exec("DROP table getoidtmp");
	exec("END");
}


/*
 * Fills `oids` with every object that still has to be parsed: the "memo"
 * objects of the author, publication, part and note relations, followed by
 * the matching files from partdata.
 */
void Analyze::readOIDs() {
	cout<<"\nDetermining objects to parse...\n";
	cout.flush();

	// First the OIDs of all large objects, one relation after the other.
	static const struct {
		int type;
		const char *rel;
	} memoSources[]={
		{ TYPEAUTHOR, "author" },
		{ TYPEPUBL,   "publication" },
		{ TYPEPART,   "part" },
		{ TYPENOTE,   "note" }
	};
	const unsigned int sourceCount=sizeof(memoSources)/sizeof(memoSources[0]);
	for(unsigned int n=0; n<sourceCount; ++n) {
		getOIDs(memoSources[n].type, memoSources[n].rel, "memo", &oids);
	}

	getFileOIDs(TYPEFILE, &oids);
}


void Analyze::readDict() {
	cout<<"Reading Dictionary...\n";
	cout.flush();
  exec("BEGIN");
  exec("SELECT oid, word from dictionary");

  for(int i=0; i<tuples(); i++) {
    dictrec *t_oid =new dictrec;
    t_oid->w=QString(getValue(i, "oid")).toUInt();
		t_oid->o=InvalidOid;
    tokens.insert(getValue(i, "word"), t_oid);
  }
  exec("END");
}

/*
 * Reads a complete large object into a freshly allocated buffer.
 *
 * i   - oid of the large object
 * len - receives the size of the object in bytes (0 when it cannot be read;
 *       left untouched when i is InvalidOid)
 *
 * Returns a NUL-terminated buffer of *len+1 bytes allocated with new[]; the
 * caller must release it with delete[]. Returns a null pointer when i is
 * InvalidOid, when the object cannot be opened, or when its size cannot be
 * determined.
 */
char *Analyze::lo2buf(Oid i, long *len) {
	if(i==InvalidOid) return 0L;

	// NOTE(review): lOpen() presumably mirrors libpq's lo_open(), which
	// reports failure with a negative descriptor - confirm against Sql.
	const int lfd=lOpen(i, INV_READ);
	if(lfd<0) {
		*len=0;
		return 0L;
	}

	// Seek to the end to learn the size, then rewind for reading.
	lSeek(lfd, 0, SEEK_END);
	const long size=lTell(lfd);
	lSeek(lfd, 0, SEEK_SET);

	// A negative size would make the terminator write below land in front
	// of the buffer, so give up instead.
	if(size<0) {
		lClose(lfd);
		*len=0;
		return 0L;
	}

	*len=size;
	char *buf=new char[size+1];
	ASSERT(buf);
	buf[size]=0;
	lRead(lfd, buf, size);
	lClose(lfd);
	return buf;
}

void Analyze::analyzeObj(uint i) {
	QStrList words;

	if(!oids.at(i)) abort();
  exec("BEGIN");
  exec("SELECT obj from analyzed WHERE obj="+QString().setNum(oids.at(i)->obj));
  if(tuples()) { exec("END"); return; }
  exec("END");

  exec("BEGIN");
  long len;
  char *buf=lo2buf(oids.at(i)->obj, &len);
  exec("END");
  istrstream input(buf);
  ostrstream s;
  extern int isHTML;
  isHTML=false;
  yyFlexLexer lexer(&input, &s);
  while(lexer.yylex()!=0);
  s.put(0);
  char *t;
  t=strtok(s.str(), " ");
  int linkcount=0;

  // in t ist der aktuelle token gespeichert

  while(t) {
		if(strlen(t)>1) {
	 	// token existiert nicht? Einfgen!
	  	if(!tokens.find(t)) {
				cout<<"-- New Word: "<<t<<"\n";
				cout.flush();
	    	dictrec *t_oid=new dictrec;
	 	  	exec("BEGIN");
	   		QString sqlstr("INSERT INTO dictionary VALUES('");
	   		sqlstr+=t;
	    	sqlstr+="')";
				exec(sqlstr);
	    	t_oid->w=oidStatus();
				t_oid->o=InvalidOid;
	 	  	exec("END");
	 			tokens.insert(t, t_oid);
			}
	
	  	// token in der token-Tabelle suchen und link speichern
			dictrec *d=0L;
	 		if((d=tokens.find(t)) && d->o!=oids.at(i)->obj) {
				d->o=oids.at(i)->obj;
		    QString s(QString().setNum(d->w));
				s+=" ";s+=QString().setNum(oids.at(i)->obj)+" "+QString().setNum(linkcount)+" ";
				s+=QString().setNum(oids.at(i)->table)+" "+QString().setNum(oids.at(i)->type)+"\n";
				words.append(s);
			}
	 	}
	 	linkcount++;
		t=strtok(0L, " ");
  }
  exec("BEGIN");
  exec("INSERT INTO analyzed VALUES("+QString().setNum(oids.at(i)->obj)+")");
  exec("END");
	exec("BEGIN");
	PGresult *res;
	res=PQexec(getConn(), "copy word from stdin using delimiters ' '");
	if(PQresultStatus(res) != PGRES_COPY_IN) abort();
	unsigned int j;
	for(j=0; j<words.count(); j++) {
		PQputline(getConn(), words.at(j));
	}
	PQputline(getConn(),"\\.");
	PQputline(getConn(),"\n");
	cout<<"---- Wrote "<<j<<" Words ----\n";
	cout.flush();
	int k=PQendcopy(getConn());
	if(k!=0) abort();
	// Postgres-Bug?!?!
	if(PQisBusy(getConn())) abort();
	PQclear(res);
  exec("END");
  s.freeze(0);
  delete buf;
}

/*
 * Throws away the complete search index: drops the three database indexes
 * (when present) and empties the word, dictionary and analyzed tables.
 * Progress is reported on cout.
 */
void Analyze::deleteIndex() {
	cout<<"\nDropping indexes...\n";
	cout.flush();

	const bool hasDictionaryWord=index("dictionary_word_index");
	if(hasDictionaryWord) {
		exec("drop index dictionary_word_index");
	}
	const bool hasWordDictionaryNo=index("word_dictionaryno_index");
	if(hasWordDictionaryNo) {
		exec("drop index word_dictionaryno_index");
	}
	const bool hasWordObj=index("word_obj_index");
	if(hasWordObj) {
		exec("drop index word_obj_index");
	}

	cout<<"\nDeleting old index...\n table word";
	cout.flush();
	exec("delete from word");

	cout<<"\n table dictionary";
	cout.flush();
	exec("delete from dictionary");

	cout<<"\n table analyzed\n";
	cout.flush();
	exec("delete from analyzed");
}

/*
 * Creates the database indexes on dictionary.word, word.dictionaryno and
 * word.obj. An index that already exists is left alone; each creation is
 * announced on cout.
 */
void Analyze::createIndexes() {
	const bool hasDictionaryWord=index("dictionary_word_index");
	if(!hasDictionaryWord) {
		cout<<"\nCreating index on dictionary.word...\n";
		cout.flush();
		exec("CREATE INDEX dictionary_word_index ON dictionary (word)");
	}

	const bool hasWordDictionaryNo=index("word_dictionaryno_index");
	if(!hasWordDictionaryNo) {
		cout<<"\nCreating index on word.dictionaryno...\n";
		cout.flush();
		exec("CREATE INDEX word_dictionaryno_index ON word (dictionaryno)");
	}

	const bool hasWordObj=index("word_obj_index");
	if(!hasWordObj) {
		cout<<"\nCreating index on word.obj...\n";
		cout.flush();
		exec("CREATE INDEX word_obj_index ON word (obj)");
	}
}

void Analyze::skipFrequentWords() {
	cout<<"\nSkipping frequent words...\n";
	cout.flush();
	exec("select count(*) from analyzed");
	int count=QString(getValue(0, "count")).toInt();
	int cut=count*max/100;
	if(table("tmpcutwords"))
		exec("drop table tmpcutwords");
	try {
		QString s("select dictionaryno, count(obj) into table tmpcutwords from word group by dictionaryno");
		exec(s);
		s="select word from dictionary where dictionary.oid=tmpcutwords.dictionaryno and tmpcutwords.count>";
		s+=QString().setNum(cut);
		exec(s);
		for(int i=0; i<tuples(); i++)
			cout<<getValue(i, "word")<<"\n";
		cout<<"Deleting "<<tuples()<<" words...\n";
		cout.flush();
		s="delete from word where word.dictionaryno=tmpcutwords.dictionaryno and tmpcutwords.count>";
		s+=QString().setNum(cut);
		exec(s);
		s="delete from dictionary where dictionary.oid=tmpcutwords.dictionaryno and tmpcutwords.count>";
		s+=QString().setNum(cut);
		exec(s);
			exec("drop table tmpcutwords");
	} catch(...) {
			if(table("tmpcutwords"))
				exec("drop table tmpcutwords");
			throw;
	}
}


void Analyze::work() {
	if(rebuild) deleteIndex();
	readOIDs();
	if(term) return;
	readDict();
	if(term) return;
  // Jedes LO ffnen, doofe Zeichen killen und in tokens zerlegen...
  for(uint i=0; i<oids.count(); i++) {
		cout<<"\n===== Parsing object "<<i+1<<" of "<<oids.count()<<" objects.=====\n";
		cout.flush();
		analyzeObj(i);
  	if(term) return;
	}
	oids.clear();
	tokens.clear();
	createIndexes();

	if(max>0) {
	cout<<"\nVacuum database...\n";
	cout.flush();
	exec("vacuum analyze dictionary");
	exec("vacuum analyze word");
	skipFrequentWords();
	}
	
	cout<<"\nVacuum database...\n";
	cout.flush();
	exec("vacuum analyze");
}



