如何使用Lucene对html文件进行索引-web前端-IT落伍者

我修改了 Lucene demo 包中的 IndexHTML 类，使其可以被其他 Java 类调用。

IndexHTML类

import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.Date;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
// Other classes from the Lucene demo package (e.g. HTMLDocument) are needed as well.
import org.apache.lucene.demo.*;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;

/**

* Create html file index for searching

* @author tyrone

*/public class IndexHTML { private String DocsPath=null;

/**

* the path for index file;

*/ private String IndexFilePath=null;

/**

* true during deletion pass

*/ private boolean deleting = false;

/**

* existing index

*/ private IndexReader reader;

/**

* new index being built

*/ private IndexWriter writer;

/**

* document id iterator

*/ private TermEnum uidIter;

private void indexDocs(File file)throws Exception {

if (fileisDirectory())

{

// if a directory String[] files = filelist();

// list its files Arrayssort(files);

// sort the files for (int i = ; i < fileslength;

i++) // recursively index themthisindexDocs(new File(file files[i]));

} else if (filegetPath()endsWith(l) || // l filesfilegetPath()endsWith() || // filesfilegetPath()endsWith(txt)) { // index txt filesif (thisuidIter != null) {String uid = HTMLDocumentuid(file);

// construct uid for doc

while (uidIterterm() != null && uidIterterm()field() == uid &&

uidIterterm()text(pareTo(uid) <) {

if (deleting) {

// delete stale docs

Systemoutprintln(deleting +

HTMLDocumentuidurl(uidIterterm()text()));

readerdelete(uidIterterm());

}

uidIternext();

}

if (uidIterterm() != null && uidIterterm()field() == uid &&

uidIterterm()text(pareTo(uid) == ) {

uidIternext();

// keep matching docs

} else if (!deleting) {

// add new docs

Document doc = HTMLDocumentDocument(file);

Systemoutprintln(adding + docget(url));

writeraddDocument(doc);

}

} else { // creating a new index

Document doc = HTMLDocumentDocument(file);

Systemoutprintln(adding + docget(url));

writeraddDocument(doc);

// add docs unconditionally

}

}return;

}

/**

* Walk directory hierarchy in uid order while keeping uid iterator from

* existing index in syncMismatches indicate one of:

* (a) old documents to be deleted;

* (b) unchanged documents to be left alone;

* or (c) new documents to be indexed

*/ private void indexDocs(File file String index boolean create)

throws Exception {

if (!create) {

// incrementally update

reader = IndexReaderopen(index);

// open existing index

uidIter = readerterms(new Term(uid ));

// init uid iterator

thisindexDocs(file);

if (deleting) {

// delete rest of stale docs

while (uidIterterm() != null && uidIterterm()field() == uid) {

Systemoutprintln(deleting +

HTMLDocumentuidurl(uidIterterm()text()));

readerdelete(uidIterterm());

uidIternext();

}

deleting = false;

}

uidIterclose();

// close uid iterator

readerclose();

// close existing index

} else

// dont have exisiting

thisindexDocs(file);

}

/**

* if create=true create a new index else refresh old index

* @param create

*/ public void run(boolean create)

{

try {

String index = index;

File root = null;

if (thisIndexFilePath!=null)

{

// index file path

index = thisIndexFilePath;

}

if (thisDocsPath==null){

Systemoutprintln(root directory is not set);

return;

}

root = new File(thisDocsPath);

Date start = new Date();

/**

* not create then maintenance

if (!create) {

// delete stale docs

thisdeleting = true;

thisindexDocs(root index create);

}

writer = new IndexWriter(index new StandardAnalyzer() create);

writermaxFieldLength = ;

thisindexDocs(root index create);

// add new docs

Systemoutprintln(Optimizing index);

writeroptimize();

writerclose();

Date end = new Date();

Systemoutprint(endgetTime() startgetTime());

Systemoutprintln( total milliseconds);

} catch (Exception e) {

Systemoutprintln( caught a + egetClass() +

\n with message: + egetMessage());

}

return;

}

/**

* @return Returns the IndexFilePath

*/ public String getIndexFilePath() {return IndexFilePath;

}

/**

* @param IndexFilePath The IndexFilePath to set

*/ public void setIndexFilePath(String property) {thisIndexFilePath = property;

}

/**

* @return Returns the DocsPath

*/ public String getDocsPath() {return DocsPath;

}

/**

* @param DocsPath The DocsPath to set

*/ public void setDocsPath(String property) {thisDocsPath = property;

}

/**

* test

* @param args

*/ public static void main(String[] args){IndexHTML ih=new IndexHTML();

ihsetDocsPath(D:\\MyProject\\colimas\\clmsdoc\\html);

ihsetIndexFilePath(D:\\MyProject\\colimas\\index);ihrun(true); }}

运行后生成 3 个文件：_i.cfs、deletable、segments。

搜索文件类

* Created on //

* TODO To change the template for this generated file go to

* Window Preferences Java Code Style Code Templates

*/package limassearchquery;

/** * @author tyrone * * TODO To change the template for this generated type comment go to

* Window Preferences Java Code Style Code Templates

*/public class HitsHTMLDoc {private String Title;

priva