package comtag;
import javaioBufferedInputStream;
import javaioBufferedOutputStream;
import javaioBufferedReader;
import javaioByteArrayInputStream;
import javaioDataOutputStream;
import javaioFile;
import javaioFileInputStream;
import javaioFileOutputStream;
import javaioFileWriter;
import javaioIOException;
import javaioInputStream;
import javaioInputStreamReader;
import javaioOutputStream;
import javaioReader;
import MalformedURLException;
import URL;
import javautil*;
import lparserParser;
import lparserTag;
import lparserfiltersTagNameFilter;
import lparserlexerLexer;
import lparserlexerPage;
import lparserutilDefaultParserFeedback;
import lparserutilNodeList;
import lparserutilParserException;
import toptracktoolsJQuery;
import javaxactivationDataHandler;
import javaxactivationDataSource;
import javaxactivationMimetypesFileTypeMap;
import javaxmailMessage;
import javaxmailMessagingException;
import javaxmailMultipart;
import javaxmailSession;
import javaxmailinternetInternetAddress;
import javaxmailinternetMimeBodyPart;
import javaxmailinternetMimeMessage;
import javaxmailinternetMimeMultipart;
import javaxmailinternetMimePartDataSource;
/**
* Builds MHT (MIME HTML) archives from web pages and parses existing MHT files.
* @author dl
*/
public class HtmlMHTCompiler {
private URL strWeb = null; /** Web page address. */
private String strText = null; /** Web page text content. */
private String strFileName = null; /** Local file name. */
private String strEncoding = null; /** Web page encoding. */
// Additional header information for the MHT output (read by createMhtArchive).
// NOTE(review): several initializers below show no value or quotes in this copy
// (e.g. "from = ;") - the literals appear to have been lost; restore them from the original source.
private String from = ;
private String to;
private String subject = mht compile;
private String cc;
private String bcc;
private String smtp = localhost;
/**
* Sample entry point: fetches the text of a page and builds an HtmlMHTCompiler for it.
* NOTE(review): the URL/encoding literals and one call target look truncated in this copy
* ("pile();" is presumably "ht.compile();") - confirm against the original source.
* @param args command line arguments (not used)
*/
public static void main(String[] args) {
String strUrl = ;
String strEncoding = utf;
// Fetch the page text through the project's JQuery helper; stop when nothing was returned.
String strText = JQuerygetHtmlText(strUrl strEncoding null);
if (strText == null)
return;
HtmlMHTCompiler ht = new HtmlMHTCompiler(strText strUrl strEncoding testmht);
pile();
//HtmlMHTCompilermhthtml(testmht l);
}
/**
* Initializes the compiler with the page content and the output target.
* If strUrl is not a valid URL, the stack trace is printed and the constructor returns
* early, so strWeb, strText, strEncoding and strFileName stay null (compile() then returns false).
* @param strText web page text content
* @param strUrl web page address
* @param strEncoding web page encoding
* @param strFileName local file name
*/
public HtmlMHTCompiler(String strText String strUrl String strEncoding String strFileName) {
// TODO Autogenerated constructor stub
try {
strWeb = new URL(strUrl);
} catch (MalformedURLException e) {
// Invalid address: report it and leave the remaining fields unset.
eprintStackTrace();
return;
}
thisstrText = strText;
thisstrEncoding = strEncoding;
thisstrFileName = strFileName;
}
/**
* Runs the conversion: parses the page text, collects the script/stylesheet and image
* links, rewrites the page text and writes the MHT archive via createMhtArchive.
* @return false if strWeb, strText, strFileName or strEncoding is null, or if
* createMhtArchive throws; true otherwise
*/
public boolean compile() {
if (strWeb == null || strText == null || strFileName == null || strEncoding == null)
return false;
HashMap urlMap = new HashMap();
NodeList nodes = new NodeList();
try {
Parser parser = createParser(strText);
parsersetEncoding(strEncoding);
nodes = parserparse(null);
} catch (ParserException e) {
// A parse failure is only printed; processing continues with the NodeList created above.
eprintStackTrace();
}
// Single-argument overload: looks for a BASE tag and may update strWeb from its href.
extractAllScriptNodes(nodes);
ArrayList urlScriptList = extractAllScriptNodes(nodes urlMap);
ArrayList urlImageList = extractAllImageNodes(nodes urlMap);
// urlMap entries are (absolute URL -> URL as written in the page), see urlMapput in the extract methods.
// NOTE(review): JHtmlClear is not visible in this file - presumably replace(text, from, to); confirm.
for (Iterator iter = urlMapentrySet(erator(); iterhasNext();) {
MapEntry entry = (MapEntry) iternext();
String key = (String)entrygetKey();
String val = (String)entrygetValue();
strText = JHtmlClearreplace(strText val key);
}
try {
createMhtArchive(strText urlScriptList urlImageList);
} catch (Exception e) {
// Any failure while building or writing the archive is printed and reported as false.
eprintStackTrace();
return false;
}
return true;
}
/**
* Creates the HTML parser for the given page text.
* @param inputHTML web page text content
* @return a Parser built on a Lexer over a Page of inputHTML, using a quiet DefaultParserFeedback
*/
private Parser createParser(String inputHTML) {
// TODO Autogenerated method stub
Lexer mLexer = new Lexer(new Page(inputHTML));
return new Parser(mLexer new DefaultParserFeedback(DefaultParserFeedbackQUIET));
}
/**
* Extracts the base URL of the page: despite its name, this overload only looks at BASE tags.
* When a BASE tag with a non-empty href is found, strWeb is replaced by that URL; a malformed
* href is printed and strWeb keeps its previous value.
* @param nodes collection of the page's tags
*/
private void extractAllScriptNodes(NodeList nodes) {
NodeList filtered = nodesextractAllNodesThatMatch(new TagNameFilter(
BASE) true);
// NOTE(review): the numeric operands/index below are missing in this copy - presumably "> 0" and element 0; confirm.
if (filtered != null && filteredsize() > ) {
Tag tag = (Tag) filteredelementAt();
String href = taggetAttribute(href);
if (href != null && hreflength() > ) {
try {
strWeb = new URL(href);
} catch (MalformedURLException e) {
// TODO Autogenerated catch block
eprintStackTrace();
}
}
}
}
/**
* Collects the script and stylesheet links contained in the page.
* For every script tag with a non-empty src, and every link tag recognised as a stylesheet
* with a non-empty href, the URL is made absolute against strWeb and the tag attribute is
* rewritten to that absolute URL. Absolute URLs not yet present in urlMap are registered there.
* @param nodes collection of the page's tags
* @param urlMap URLs collected so far (absolute URL -> URL as written in the page); updated in place
* @return list of two-element lists: [URL as written in the page, absolute URL]
*/
private ArrayList extractAllScriptNodes(NodeList nodes HashMap urlMap) {
ArrayList urlList = new ArrayList();
NodeList filtered = nodesextractAllNodesThatMatch(new TagNameFilter(script) true);
for (int i = ; i < filteredsize(); i++) {
Tag tag = (Tag) filteredelementAt(i);
String src = taggetAttribute(src);
// Handle external script file URLs
if (src != null && srclength() > ) {
String innerURL = src;
String absoluteURL = makeAbsoluteURL(strWeb innerURL);
// NOTE(review): "ntainsKey" is presumably "urlMap.containsKey" truncated in this copy - confirm.
if (absoluteURL != null && !ntainsKey(absoluteURL)) {
urlMapput(absoluteURL innerURL);
ArrayList urlInfo = new ArrayList();
urlInfoadd(innerURL);
urlInfoadd(absoluteURL);
urlListadd(urlInfo);
}
// The attribute is rewritten even when absoluteURL is null or already known.
tagsetAttribute(src absoluteURL);
}
}
filtered = nodesextractAllNodesThatMatch(new TagNameFilter(link) true);
for (int i = ; i < filteredsize(); i++) {
Tag tag = (Tag) filteredelementAt(i);
String type = (taggetAttribute(type));
String rel = (taggetAttribute(rel));
String href = taggetAttribute(href);
// A link counts as a stylesheet based on rel when rel is present, otherwise based on type.
boolean isCssFile = false;
if (rel != null) {
isCssFile = relindexOf(stylesheet) != ;
} else if (type != null) {
isCssFile |= typeindexOf(text/css) != ;
}
// Handle external css files url
if (isCssFile && href != null && hreflength() > ) {
String innerURL = href;
String absoluteURL = makeAbsoluteURL(strWeb innerURL);
if (absoluteURL != null && !ntainsKey(absoluteURL)) {
urlMapput(absoluteURL innerURL);
ArrayList urlInfo = new ArrayList();
urlInfoadd(innerURL);
urlInfoadd(absoluteURL);
urlListadd(urlInfo);
}
tagsetAttribute(href absoluteURL);
}
}
return urlList;
}
/**
* Collects the image links contained in the page.
* For every IMG tag with a non-empty src, the URL is made absolute against strWeb and the
* src attribute is rewritten to it. Absolute URLs not yet present in urlMap are registered there.
* @param nodes collection of the page's tags
* @param urlMap URLs collected so far (absolute URL -> URL as written in the page); updated in place
* @return list of two-element lists: [URL as written in the page, absolute URL]
*/
private ArrayList extractAllImageNodes(NodeList nodes HashMap urlMap) {
ArrayList urlList = new ArrayList();
NodeList filtered = nodesextractAllNodesThatMatch(new TagNameFilter(IMG) true);
for (int i = ; i < filteredsize(); i++) {
Tag tag = (Tag) filteredelementAt(i);
String src = taggetAttribute(src);
// Handle image URLs
if (src != null && srclength() > ) {
String innerURL = src;
String absoluteURL = makeAbsoluteURL(strWeb innerURL);
// NOTE(review): "ntainsKey" is presumably "urlMap.containsKey" truncated in this copy - confirm.
if (absoluteURL != null && !ntainsKey(absoluteURL)) {
urlMapput(absoluteURL innerURL);
ArrayList urlInfo = new ArrayList();
urlInfoadd(innerURL);
urlInfoadd(absoluteURL);
urlListadd(urlInfo);
}
// The attribute is rewritten even when absoluteURL is null or already known.
tagsetAttribute(src absoluteURL);
}
}
return urlList;
}
/**
* Converts a link found in the page into an absolute URL.
* The part of the link from '?' on is cut off first. A link recognised by the "http" check
* is returned as is; any other link is resolved against strWeb.
* @param strWeb web page address used as the base for resolution
* @param innerURL link as written in the page
* @return the absolute URL, or null if the link cannot be resolved into a valid URL
*/
public static String makeAbsoluteURL(URL strWeb String innerURL) {
// TODO Autogenerated method stub
// Strip the suffix (query string)
// NOTE(review): innerURL is dereferenced here before the null check further down,
// so a null innerURL fails on this line - confirm whether callers can pass null.
int pos = innerURLindexOf(?);
if (pos != ) {
innerURL = innerURLsubstring( pos);
}
// NOTE(review): the compared index is missing in this copy - presumably "== 0" (link starts with http); confirm.
if (innerURL != null
&& innerURLtoLowerCase()indexOf(http) == ) {
Systemoutprintln(innerURL);
return innerURL;
}
URL linkUri = null;
try {
linkUri = new URL(strWeb innerURL);
} catch (MalformedURLException e) {
//TODO Autogenerated catch block
eprintStackTrace();
return null;
}
String absURL = linkUritoString();
// NOTE(review): the search/replacement literals of these two calls are not recoverable from this copy.
absURL = JHtmlClearreplace(absURL / );
absURL = JHtmlClearreplace(absURL / );
Systemoutprintln(absURL);
return absURL;
}
/**
* Creates the MHT file: builds a MIME message whose multipart body holds the page as the
* first part, followed by one part per script/stylesheet and per image, and writes the
* message to strFileName.
* @param content web page text content
* @param urlScriptList script/stylesheet links, as returned by extractAllScriptNodes
* @param urlImageList image links, as returned by extractAllImageNodes
* @throws Exception if building or writing the message fails
*/
private void createMhtArchive(String content ArrayList urlScriptList ArrayList urlImageList) throws Exception {
//Instantiate a Multipart object
MimeMultipart mp = new MimeMultipart(related);
Properties props = new Properties();
propsput(mailsmtphost smtp);
Session session = SessiongetDefaultInstance(props null);
MimeMessage msg = new MimeMessage(session);
// set mailer
msgsetHeader(XMailer Code Manager SWT);
// set from
if (from != null) {
msgsetFrom(new InternetAddress(from));
}
// set subject
if (subject != null) {
msgsetSubject(subject);
}
// to
if (to != null) {
InternetAddress[] toAddresses = getInetAddresses(to);
msgsetRecipients(MessageRecipientTypeTO toAddresses);
}
// cc
if (cc != null) {
InternetAddress[] ccAddresses = getInetAddresses(cc);
msgsetRecipients(MessageRecipientTypeCC ccAddresses);
}
// bcc
if (bcc != null) {
InternetAddress[] bccAddresses = getInetAddresses(bcc);
msgsetRecipients(MessageRecipientTypeBCC bccAddresses);
}
// Set the page body as the first part
MimeBodyPart bp = new MimeBodyPart();
bpsetText(content strEncoding);
bpaddHeader(ContentType text/html;charset= + strEncoding);
bpaddHeader(ContentLocation strWebtoString());
mpaddBodyPart(bp);
// One part per script/stylesheet; the content is supplied by AttachmentDataSource.
int urlCount = urlScriptListsize();
for (int i = ; i < urlCount; i++) {
bp = new MimeBodyPart();
ArrayList urlInfo = (ArrayList) urlScriptListget(i);
// String url = urlInfoget()toString();
// NOTE(review): the list index is missing in this copy - presumably the absolute URL entry; confirm.
String absoluteURL = urlInfoget()toString();
bp
addHeader(ContentLocation
javaxmailinternetMimeUtility
encodeWord(URLDecoder
decode(absoluteURL strEncoding)));
DataSource source = new AttachmentDataSource(absoluteURL text);
bpsetDataHandler(new DataHandler(source));
mpaddBodyPart(bp);
}
// One part per image, built the same way with the image type.
urlCount = urlImageListsize();
for (int i = ; i < urlCount; i++) {
bp = new MimeBodyPart();
ArrayList urlInfo = (ArrayList) urlImageListget(i);
// String url = urlInfoget()toString();
String absoluteURL = urlInfoget()toString();
bp
addHeader(ContentLocation
javaxmailinternetMimeUtility
encodeWord(URLDecoder
decode(absoluteURL strEncoding)));
DataSource source = new AttachmentDataSource(absoluteURL image);
bpsetDataHandler(new DataHandler(source));
mpaddBodyPart(bp);
}
msgsetContent(mp);
// write the mime multi part message to a file
// NOTE(review): the FileOutputStream created here is never closed in this method.
msgwriteTo(new FileOutputStream(strFileName));
}
/**
* Converts an MHT file to an HTML file: reads the page text from a body part of the
* multipart message, saves the other parts that carry a ContentLocation header as local
* files, replaces their URLs in the page text with the local paths and writes the result.
* Any exception is printed and ends the conversion.
* @param strMht path of the MHT file
* @param strHtml path of the HTML file to write
*/
public static void mhthtml(String strMht String strHtml) {
try {
//TODO readEmlFile
// NOTE(review): fis is never closed in this method.
InputStream fis = new FileInputStream(strMht);
Session mailSession = SessiongetDefaultInstance(SystemgetProperties() null);
MimeMessage msg = new MimeMessage(mailSession fis);
Object content = msggetContent();
if (content instanceof Multipart) {
MimeMultipart mp = (MimeMultipart)content;
// NOTE(review): the part index is missing in this copy - presumably the first part holds the page; confirm.
MimeBodyPart bp = (MimeBodyPart)mpgetBodyPart();
String strEncodng = getEncoding(bp);
String strText = getHtmlText(bp strEncodng);
if (strText == null)
return;
// Directory for the extracted resources, named after the HTML file's absolute path.
File parent = null;
if (mpgetCount() > ) {
parent = new File(new File(strHtml)getAbsolutePath() + files);
parentmkdirs();
if (!parentexists())
return;
}
for (int i = ; i < mpgetCount(); ++i) {
// NOTE(review): "bp" is declared a second time here - the two variables presumably had
// distinct names (numeric suffixes) in the original source; confirm.
MimeBodyPart bp = (MimeBodyPart)mpgetBodyPart(i);
String strUrl = getResourcesUrl(bp);
// Parts without a ContentLocation header are skipped.
if (strUrl == null)
continue;
DataHandler dataHandler = bpgetDataHandler();
// NOTE(review): source is assigned but not used afterwards.
MimePartDataSource source = (MimePartDataSource)dataHandlergetDataSource();
File resources = new File(parentgetAbsolutePath() + Fileseparator + getName(strUrl i));
// Point the page at the local copy only when the resource was saved.
if (saveResourcesFile(resources bpgetInputStream()))
strText = JHtmlClearreplace(strText strUrl resourcesgetAbsolutePath());
}
saveHtml(strText strHtml);
}
} catch (Exception e) {
// TODO Autogenerated catch block
eprintStackTrace();
}
}
/**
* Derives the local file name for a resource.
* When strName contains the separator, the text after its last occurrence is passed
* through format() and returned; otherwise a fallback name built from ID is returned.
* @param strName link of the resource file
* @param ID sequence number of the resource file
* @return local temporary file name for the resource
*/
public static String getName(String strName int ID) {
char separator = /;
// Debug output of the inputs.
Systemoutprintln(strName);
Systemoutprintln(separator);
// NOTE(review): numeric operands are missing in this copy - presumably ">= 0" and "+ 1"; confirm.
if( strNamelastIndexOf(separator) >= )
return format(strNamesubstring(strNamelastIndexOf(separator) + ));
return temp + ID;
}
/**
* Reads the page encoding from the headers of a body part.
* Scans all headers for the content type header and returns the text following "charset="
* in its value, with one charset name mapped to gbk.
* @param bp body part holding the page content; may be null
* @return the encoding found, or null if bp is null, no charset is present or reading the headers fails
*/
private static String getEncoding(MimeBodyPart bp) {
if (bp != null) {
try {
Enumeration list = bpgetAllHeaders();
while (listhasMoreElements()) {
javaxmailHeader head = (javaxmailHeader)listnextElement();
// NOTE(review): "headgetName(pareTo(" is presumably "head.getName().compareTo(" truncated in this copy - confirm.
if (headgetName(pareTo(ContentType) == ) {
String strType = headgetValue();
int pos = strTypeindexOf(charset=);
if (pos != ) {
// NOTE(review): the offset added to pos and the argument separator are missing in this copy -
// presumably substring(pos + <length of "charset=">, strType.length()); confirm.
String strEncoding = strTypesubstring(pos + strTypelength());
if (strEncodingtoLowerCase(pareTo(gb) == ) {
strEncoding = gbk;
}
return strEncoding;
}
}
}
} catch (MessagingException e) {
// Header access failed: print and fall through to the null result.
eprintStackTrace();
}
}
return null;
}
/**
* Reads the resource URL from the headers of a body part.
* @param bp body part of the MHT message; may be null
* @return value of the first content location header, or null if bp is null,
* the header is absent or reading the headers fails
*/
private static String getResourcesUrl(MimeBodyPart bp) {
if (bp != null) {
try {
Enumeration list = bpgetAllHeaders();
while (listhasMoreElements()) {
javaxmailHeader head = (javaxmailHeader)listnextElement();
// NOTE(review): "headgetName(pareTo(" is presumably "head.getName().compareTo(" truncated in this copy - confirm.
if (headgetName(pareTo(ContentLocation) == ) {
return headgetValue();
}
}
} catch (MessagingException e) {
// Header access failed: print and fall through to the null result.
eprintStackTrace();
}
}
return null;
}
/**
* Normalises a file name so that it follows file naming rules.
* Every character of strName that occurs in the list of disallowed characters is replaced.
* @param strName file name; may be null
* @return the processed file name, or null if strName is null
*/
private static String format(String strName) {
if (strName == null)
return null;
// NOTE(review): the replaceAll arguments, the disallowed-character literal and the replacement
// character below are damaged in this copy (the literal even contains an editor artifact,
// "___FCKpd___") - restore them from the original source.
strName = strNamereplaceAll( );
String strText = \\/:*?\<>|^___FCKpd___quot;;
for (int i = ; i < strNamelength(); ++i) {
String ch = StringvalueOf(strNamecharAt(i));
if (strTextindexOf(ch) != ) {
strName = strNamereplace(strNamecharAt(i) );
}
}
return strName;
}
/**
* Saves a resource by copying inputStream into the file resources.
* If nothing could be read from the stream, the created file is deleted again; the method
* still returns true in that case.
* @param resources resource file to create
* @param inputStream stream whose content is written to the file; closed by this method
* @return false if an argument is null or an exception occurs while copying or closing; true otherwise
*/
private static boolean saveResourcesFile(File resources InputStream inputStream) {
if (resources == null || inputStream == null) {
return false;
}
BufferedInputStream in = null;
FileOutputStream fio = null;
BufferedOutputStream osw = null;
try {
in = new BufferedInputStream(inputStream);
fio = new FileOutputStream(resources);
osw = new BufferedOutputStream(new DataOutputStream(fio));
int b;
// NOTE(review): buffer size, end-of-stream value and write offset are missing in this copy -
// presumably a fixed-size buffer, "!= -1" and write(a, 0, b); confirm.
byte[] a = new byte[];
boolean isEmpty = true;
while ((b = inread(a)) != ) {
isEmpty = false;
oswwrite(a b);
oswflush();
}
// The streams are closed here and once more in the finally block below.
oswclose();
fioclose();
inclose();
inputStreamclose();
// Remove the empty file left behind when the stream had no data.
if (isEmpty)
resourcesdelete();
return true;
} catch (Exception e) {
// TODO Autogenerated catch block
eprintStackTrace();
Systemoutprintln(解析mht失败);
return false;
} finally{
try {
if (osw != null)
oswclose();
if (fio != null)
fioclose();
if (in != null)
inclose();
if (inputStream != null)
inputStreamclose();
} catch (Exception e) {
// NOTE(review): returning from finally replaces the result decided in the try/catch above.
eprintStackTrace();
Systemoutprintln(解析mht失败);
return false;
}
}
}
/**
* Reads the title of an MHT file.
* The page text is taken from a body part of the multipart message, converted to lower case
* and searched for the title element; the returned title is therefore lower-cased.
* @param mhtFilename name of the MHT file
* @return the trimmed text between the title tags, or null if the message is not multipart,
* the text cannot be read, no title is found or an exception occurs
*/
public static String getTitle(String mhtFilename) {
try {
//TODO readEmlFile
// NOTE(review): fis is never closed in this method.
InputStream fis = new FileInputStream(mhtFilename);
Session mailSession = SessiongetDefaultInstance(SystemgetProperties() null);
MimeMessage msg = new MimeMessage(mailSession fis);
Object content = msggetContent();
if (content instanceof Multipart) {
MimeMultipart mp = (MimeMultipart)content;
// NOTE(review): the part index is missing in this copy - presumably the first part holds the page; confirm.
MimeBodyPart bp = (MimeBodyPart)mpgetBodyPart();
String strEncodng = getEncoding(bp);
String strText = getHtmlText(bp strEncodng);
if (strText == null)
return null;
strText = strTexttoLowerCase();
// NOTE(review): the two positions share the name "pos" in this copy - presumably distinct
// variables (numeric suffixes) for the opening and closing tag; the offsets are missing too.
int pos = strTextindexOf(<title>);
int pos = strTextindexOf(</title>);
if (pos != && pos!= && pos > pos) {
return strTextsubstring(pos + pos)trim();
}
}
return null;
} catch (Exception e) {
// TODO Autogenerated catch block
eprintStackTrace();
return null;
}
}
/**
* Reads the HTML text of a body part.
* The part's input stream is decoded with strEncoding and read line by line; each line is
* appended followed by a CR/LF pair.
* @param bp body part holding the page content
* @param strEncoding encoding of the content
* @return the HTML text, or null if an exception occurs while reading
*/
private static String getHtmlText(MimeBodyPart bp String strEncoding) {
InputStream textStream = null;
BufferedInputStream buff = null;
BufferedReader br = null;
Reader r = null;
try {
textStream = bpgetInputStream();
buff = new BufferedInputStream(textStream);
r = new InputStreamReader(buff strEncoding);
br = new BufferedReader(r);
StringBuffer strHtml = new StringBuffer();
String strLine = null;
while ((strLine = brreadLine()) != null) {
strHtmlappend(strLine + \r\n);
}
// Closed here on success; the finally block closes the streams again.
brclose();
rclose();
textStreamclose();
return strHtmltoString();
} catch (Exception e) {
// Any failure (including a failure inside the reader setup) is printed; null is returned below.
eprintStackTrace();
} finally{
try{
if (br != null)
brclose();
if (buff != null)
buffclose();
if (textStream != null)
textStreamclose();
}catch(Exception e){
Systemoutprintln(解析mht失败);
}
}
return null;
}
/**
* Saves the HTML text to a file. An IOException is printed and not propagated.
* @param strText HTML content
* @param strHtml name of the HTML file to write
*/
private static void saveHtml(String strText String strHtml) {
try {
// NOTE(review): fw is closed only on the success path; it stays open if write throws.
FileWriter fw = new FileWriter(strHtml);
fwwrite(strText);
fwclose();
} catch (IOException e) {
// TODO Autogenerated catch block
eprintStackTrace();
Systemoutprintln(解析mht失败);
}
}
/**
* Splits a string of e-mail addresses into tokens and converts each token to an InternetAddress.
* NOTE(review): the delimiter argument of the StringTokenizer is missing in this copy - confirm
* which separator characters the original used.
* @param emails address list as a single string
* @return one InternetAddress per token, in token order
* @throws Exception if a token cannot be converted to an InternetAddress
*/
private InternetAddress[] getInetAddresses(String emails) throws Exception {
ArrayList list = new ArrayList();
StringTokenizer tok = new StringTokenizer(emails );
while (tokhasMoreTokens()) {
listadd(toknextToken());
}
int count = listsize();
InternetAddress[] addresses = new InternetAddress[count];
for (int i = ; i < count; i++) {
addresses[i] = new InternetAddress(listget(i)toString());
}
return addresses;
}
/**
* DataSource for one resource of the page (script, stylesheet or image).
* The resource is downloaded once in the constructor and served from memory afterwards.
*/
class AttachmentDataSource implements DataSource {
private MimetypesFileTypeMap map = new MimetypesFileTypeMap();
private String strUrl;
private String strType;
// Downloaded content of the resource (despite the name, this holds the bytes, not a size).
private byte[] dataSize = null;
/**
* Maps the resource kind passed to the constructor to a fixed content type.
*/
private Map normalMap = new HashMap();
{
// Initiate normal mime type map
// Images
normalMapput(image image/jpeg);
normalMapput(text text/plain);
}
/**
* Stores the URL and kind and downloads the resource through JQuerydownBinaryFile.
* The field keeps the URL as passed in; the trimmed and re-encoded copy is used for the download only.
* @param strUrl URL of the resource
* @param strType resource kind, used as key into normalMap
*/
public AttachmentDataSource(String strUrl String strType) {
thisstrType = strType;
thisstrUrl = strUrl;
strUrl = strUrltrim();
// NOTE(review): the replaceAll arguments are damaged in this copy - presumably spaces are percent-encoded; confirm.
strUrl = strUrlreplaceAll( %);
dataSize = JQuerydownBinaryFile(strUrl null);
}
/**
* Returns the content type
*/
public String getContentType() {
return getMimeType(getName());
}
/**
* Returns the part of the URL after the last file separator character, or the whole URL
* if it contains none.
* NOTE(review): this uses the platform file separator rather than '/', so the result
* presumably differs between platforms for URLs - confirm the intent.
*/
public String getName() {
char separator = FileseparatorChar;
if( strUrllastIndexOf(separator) >= )
return strUrlsubstring(strUrllastIndexOf(separator) + );
return strUrl;
}
/**
* Resolves the content type: first by resource kind via normalMap, then by file name via
* the MimetypesFileTypeMap, and finally a generic binary type as fallback.
*/
private String getMimeType(String fileName) {
String type = (String)normalMapget(strType);
if (type == null) {
try {
type = mapgetContentType(fileName);
} catch (Exception e) {
// TODO: handle exception
}
Systemoutprintln(type);
// Fix the null exception
if (type == null) {
type = application/octetstream;
}
}
return type;
}
/**
* Returns a stream over the downloaded bytes; a failed download (null) is replaced by a
* newly allocated array first.
*/
public InputStream getInputStream() throws IOException {
// TODO Autogenerated method stub
if (dataSize == null)
dataSize = new byte[];
return new ByteArrayInputStream(dataSize);
}
/**
* Returns a new in-memory output stream on every call; it is not connected to the downloaded data.
*/
public OutputStream getOutputStream() throws IOException {
// TODO Autogenerated method stub
return new javaioByteArrayOutputStream();
}
}
}