java

位置:IT落伍者 >> java >> 浏览文章

使用 Java 将网页保存为 MHT 格式


发布日期:2018年12月19日
 
使用 Java 将网页保存为 MHT 格式

package comtag;

import javaioBufferedInputStream;

import javaioBufferedOutputStream;

import javaioBufferedReader;

import javaioByteArrayInputStream;

import javaioDataOutputStream;

import javaioFile;

import javaioFileInputStream;

import javaioFileOutputStream;

import javaioFileWriter;

import javaioIOException;

import javaioInputStream;

import javaioInputStreamReader;

import javaioOutputStream;

import javaioReader;

import MalformedURLException;

import URL;

import javautil*;

import lparserParser;

import lparserTag;

import lparserfiltersTagNameFilter;

import lparserlexerLexer;

import lparserlexerPage;

import lparserutilDefaultParserFeedback;

import lparserutilNodeList;

import lparserutilParserException;

import toptracktoolsJQuery;

import javaxactivationDataHandler;

import javaxactivationDataSource;

import javaxactivationMimetypesFileTypeMap;

import javaxmailMessage;

import javaxmailMessagingException;

import javaxmailMultipart;

import javaxmailSession;

import javaxmailinternetInternetAddress;

import javaxmailinternetMimeBodyPart;

import javaxmailinternetMimeMessage;

import javaxmailinternetMimeMultipart;

import javaxmailinternetMimePartDataSource;

/**

* Builds MHT (MIME HTML) archives from web pages and unpacks MHT archives back into HTML files.

* @author dl

*/

public class HtmlMHTCompiler {

private URL strWeb = null; /** Base URL of the page; relative links are resolved against it. */

private String strText = null; /** HTML text of the page. */

private String strFileName = null; /** Path of the local file the archive is written to. */

private String strEncoding = null; /** Character encoding of the page. */

// Extra message headers used when the MHT (MIME) message is built.

// NOTE(review): the initializer below is empty in this copy; a string literal appears to have
// been lost during extraction -- restore it from the original source.
private String from = ;

private String to;

// NOTE(review): looks like a string literal whose quotes were lost -- confirm.
private String subject = mht compile;

private String cc;

private String bcc;

// NOTE(review): presumably a quoted SMTP host name; the quotes are missing in this copy.
private String smtp = localhost;

/**
 * Demo entry point: fetches the text of a page through the JQuery helper, stops when no text
 * came back, and otherwise constructs a compiler for that text.
 *
 * NOTE(review): the URL literal, the quotes around the encoding and file name, the argument
 * separators and the receiver of the final call were lost in this copy; the last statement is
 * presumably a call to compile() on the instance created just above -- confirm.
 */
public static void main(String[] args) {

String strUrl = ;

String strEncoding = utf;

String strText = JQuerygetHtmlText(strUrl strEncoding null);

// Nothing to convert when the download produced no text.
if (strText == null)

return;

HtmlMHTCompiler ht = new HtmlMHTCompiler(strText strUrl strEncoding testmht);

pile();

// Disabled example of the reverse conversion (MHT back to HTML).
//HtmlMHTCompilermhthtml(testmht l);

}

/**
 * Initializes the compiler for one page.
 *
 * The page address is parsed first. When it is malformed the stack trace is printed and the
 * constructor returns early, leaving strText, strEncoding and strFileName unset; compile()
 * then returns false because it checks those fields for null.
 *
 * @param strText HTML text of the page
 * @param strUrl address of the page
 * @param strEncoding character encoding of the page
 * @param strFileName local file name of the archive
 */

public HtmlMHTCompiler(String strText String strUrl String strEncoding String strFileName) {

try {

strWeb = new URL(strUrl);

} catch (MalformedURLException e) {

// A malformed address aborts initialization; the remaining fields stay null.

eprintStackTrace();

return;

}

thisstrText = strText;

thisstrEncoding = strEncoding;

thisstrFileName = strFileName;

}

/**
 * Runs the conversion of the page held by this instance into an MHT archive.
 *
 * Steps, in order: parse the HTML text, let a BASE tag override the base URL, collect the
 * script/stylesheet links and the image links, rewrite the page text for every collected link,
 * then hand the text and both link lists to createMhtArchive.
 *
 * @return false when a required field is null or when writing the archive throws; true otherwise
 */

public boolean compile() {

// Fields are null when the constructor bailed out on a malformed URL.
if (strWeb == null || strText == null || strFileName == null || strEncoding == null)

return false;

// Filled by the extract methods with absolute URL -> link text as written in the page.
HashMap urlMap = new HashMap();

NodeList nodes = new NodeList();

try {

Parser parser = createParser(strText);

parsersetEncoding(strEncoding);

nodes = parserparse(null);

} catch (ParserException e) {

// A parse failure is only printed; processing continues with whatever nodes holds.

eprintStackTrace();

}

// One-argument overload: updates strWeb from a BASE tag when the page has one.
extractAllScriptNodes(nodes);

ArrayList urlScriptList = extractAllScriptNodes(nodes urlMap);

ArrayList urlImageList = extractAllImageNodes(nodes urlMap);

// NOTE(review): the iterator call is garbled in this copy; presumably the entry set's iterator.
for (Iterator iter = urlMapentrySet(erator(); iterhasNext();) {

MapEntry entry = (MapEntry) iternext();

String key = (String)entrygetKey();

String val = (String)entrygetValue();

// key is the absolute URL, val the original link. JHtmlClear is not visible in this file;
// presumably this replaces every occurrence of val in the text with key -- confirm.
strText = JHtmlClearreplace(strText val key);

}

try {

createMhtArchive(strText urlScriptList urlImageList);

} catch (Exception e) {

// Any failure while building or writing the archive is reported as false.

eprintStackTrace();

return false;

}

return true;

}

/**
 * Builds an HTML parser over in-memory page text.
 *
 * @param inputHTML HTML text of the page
 * @return a Parser reading from a Lexer over inputHTML, created with the QUIET feedback setting
 */

private Parser createParser(String inputHTML) {

// The page is wrapped in a Page and a Lexer, so the parser reads from the given string.

Lexer mLexer = new Lexer(new Page(inputHTML));

return new Parser(mLexer new DefaultParserFeedback(DefaultParserFeedbackQUIET));

}

/**
 * Looks for a BASE tag and, when it carries a non-empty href, makes that href the base URL
 * (strWeb) used for resolving relative links. Despite its name this overload does not deal
 * with script tags.
 *
 * NOTE(review): the numeric constants in the size/length comparisons and the element index
 * were lost in this copy; presumably "more than zero" and the first element -- confirm.
 *
 * @param nodes tags of the parsed page
 */

private void extractAllScriptNodes(NodeList nodes) {

NodeList filtered = nodesextractAllNodesThatMatch(new TagNameFilter(

BASE) true);

if (filtered != null && filteredsize() > ) {

Tag tag = (Tag) filteredelementAt();

String href = taggetAttribute(href);

if (href != null && hreflength() > ) {

try {

strWeb = new URL(href);

} catch (MalformedURLException e) {

// A malformed href is only printed; strWeb keeps its previous value.

eprintStackTrace();

}

}

}

}

/**
 * Collects the external script and stylesheet links of the page and rewrites the matching tag
 * attributes to absolute URLs.
 *
 * Script tags contribute their src attribute. Link tags contribute their href only when they
 * are recognized as stylesheets: by rel when rel is present, otherwise by type.
 *
 * NOTE(review): loop start values, comparison constants and the receiver of the containsKey
 * call were lost in this copy; the receiver is presumably urlMap -- confirm.
 *
 * @param nodes tags of the parsed page
 * @param urlMap absolute URL -> original link, shared across the extract methods; updated here
 * @return a list with one two-element list (original link, absolute URL) per newly seen link
 */

private ArrayList extractAllScriptNodes(NodeList nodes HashMap urlMap) {

ArrayList urlList = new ArrayList();

NodeList filtered = nodesextractAllNodesThatMatch(new TagNameFilter(script) true);

for (int i = ; i < filteredsize(); i++) {

Tag tag = (Tag) filteredelementAt(i);

String src = taggetAttribute(src);

// Only scripts loaded from an external file have a src to collect.

if (src != null && srclength() > ) {

String innerURL = src;

String absoluteURL = makeAbsoluteURL(strWeb innerURL);

// Record each absolute URL once, even if several tags point at it.
if (absoluteURL != null && !ntainsKey(absoluteURL)) {

urlMapput(absoluteURL innerURL);

ArrayList urlInfo = new ArrayList();

urlInfoadd(innerURL);

urlInfoadd(absoluteURL);

urlListadd(urlInfo);

}

// Runs even when absoluteURL is null, i.e. when the link could not be resolved.
tagsetAttribute(src absoluteURL);

}

}

filtered = nodesextractAllNodesThatMatch(new TagNameFilter(link) true);

for (int i = ; i < filteredsize(); i++) {

Tag tag = (Tag) filteredelementAt(i);

String type = (taggetAttribute(type));

String rel = (taggetAttribute(rel));

String href = taggetAttribute(href);

boolean isCssFile = false;

// type is consulted only when the tag has no rel attribute at all.
if (rel != null) {

isCssFile = relindexOf(stylesheet) != ;

} else if (type != null) {

isCssFile |= typeindexOf(text/css) != ;

}

// Handle external css files url

if (isCssFile && href != null && hreflength() > ) {

String innerURL = href;

String absoluteURL = makeAbsoluteURL(strWeb innerURL);

if (absoluteURL != null && !ntainsKey(absoluteURL)) {

urlMapput(absoluteURL innerURL);

ArrayList urlInfo = new ArrayList();

urlInfoadd(innerURL);

urlInfoadd(absoluteURL);

urlListadd(urlInfo);

}

tagsetAttribute(href absoluteURL);

}

}

return urlList;

}

/**
 * Collects the image links of the page (src of IMG tags) and rewrites each src attribute to
 * its absolute URL.
 *
 * NOTE(review): the loop start value, the length comparison constant and the receiver of the
 * containsKey call were lost in this copy; the receiver is presumably urlMap -- confirm.
 *
 * @param nodes tags of the parsed page
 * @param urlMap absolute URL -> original link, shared across the extract methods; updated here
 * @return a list with one two-element list (original link, absolute URL) per newly seen image
 */

private ArrayList extractAllImageNodes(NodeList nodes HashMap urlMap) {

ArrayList urlList = new ArrayList();

NodeList filtered = nodesextractAllNodesThatMatch(new TagNameFilter(IMG) true);

for (int i = ; i < filteredsize(); i++) {

Tag tag = (Tag) filteredelementAt(i);

String src = taggetAttribute(src);

// Skip images without a usable src.

if (src != null && srclength() > ) {

String innerURL = src;

String absoluteURL = makeAbsoluteURL(strWeb innerURL);

// Record each absolute URL once; urlMap may already hold it from an earlier tag.
if (absoluteURL != null && !ntainsKey(absoluteURL)) {

urlMapput(absoluteURL innerURL);

ArrayList urlInfo = new ArrayList();

urlInfoadd(innerURL);

urlInfoadd(absoluteURL);

urlListadd(urlInfo);

}

// Runs even when absoluteURL is null, i.e. when the link could not be resolved.
tagsetAttribute(src absoluteURL);

}

}

return urlList;

}

/**

*<br>方法说明相对路径转绝对路径

*<br>输入参数strWeb 网页地址; innerURL 相对路径链接

*<br>返回类型绝对路径链接

*/

public static String makeAbsoluteURL(URL strWeb String innerURL) {

// TODO Autogenerated method stub

//去除后缀

int pos = innerURLindexOf(?);

if (pos != ) {

innerURL = innerURLsubstring( pos);

}

if (innerURL != null

&& innerURLtoLowerCase()indexOf(http) == ) {

Systemoutprintln(innerURL);

return innerURL;

}

URL linkUri = null;

try {

linkUri = new URL(strWeb innerURL);

} catch (MalformedURLException e) {

//TODO Autogenerated catch block

eprintStackTrace();

return null;

}

String absURL = linkUritoString();

absURL = JHtmlClearreplace(absURL / );

absURL = JHtmlClearreplace(absURL / );

Systemoutprintln(absURL);

return absURL;

}

/**
 * Builds the MHT archive as a MIME message and writes it to the file named by strFileName.
 *
 * The message is a "related" multipart: the first part is the page text, followed by one part
 * per script/stylesheet link and one part per image link. Each resource part is backed by an
 * AttachmentDataSource created for its absolute URL.
 *
 * NOTE(review): header-name literals, list indices and loop start values were lost in this
 * copy. The link lists hold (original link, absolute URL) pairs, so the index used to read
 * absoluteURL is presumably the second element -- confirm.
 * NOTE(review): URLDecoder is not among the imports visible at the top of this file.
 *
 * @param content page text, already rewritten by compile()
 * @param urlScriptList script/stylesheet links, as returned by extractAllScriptNodes
 * @param urlImageList image links, as returned by extractAllImageNodes
 * @throws Exception whatever the mail API or the file output raises; not handled here
 */

private void createMhtArchive(String content ArrayList urlScriptList ArrayList urlImageList) throws Exception {

//Instantiate a Multipart object

MimeMultipart mp = new MimeMultipart(related);

Properties props = new Properties();

propsput(mailsmtphost smtp);

// The session only serves to construct the message; nothing in this method sends mail.
Session session = SessiongetDefaultInstance(props null);

MimeMessage msg = new MimeMessage(session);

// set mailer

msgsetHeader(XMailer Code Manager SWT);

// set from

if (from != null) {

msgsetFrom(new InternetAddress(from));

}

// set subject

if (subject != null) {

msgsetSubject(subject);

}

// to

if (to != null) {

InternetAddress[] toAddresses = getInetAddresses(to);

msgsetRecipients(MessageRecipientTypeTO toAddresses);

}

// cc

if (cc != null) {

InternetAddress[] ccAddresses = getInetAddresses(cc);

msgsetRecipients(MessageRecipientTypeCC ccAddresses);

}

// bcc

if (bcc != null) {

InternetAddress[] bccAddresses = getInetAddresses(bcc);

msgsetRecipients(MessageRecipientTypeBCC bccAddresses);

}

// First part: the page text, labelled with the page's own address.

MimeBodyPart bp = new MimeBodyPart();

bpsetText(content strEncoding);

bpaddHeader(ContentType text/html;charset= + strEncoding);

bpaddHeader(ContentLocation strWebtoString());

mpaddBodyPart(bp);

// One part per script/stylesheet link.
int urlCount = urlScriptListsize();

for (int i = ; i < urlCount; i++) {

bp = new MimeBodyPart();

ArrayList urlInfo = (ArrayList) urlScriptListget(i);

// String url = urlInfoget()toString();

String absoluteURL = urlInfoget()toString();

// The location header carries the URL after decoding it and encoding it as a MIME word.
bp

addHeader(ContentLocation

javaxmailinternetMimeUtility

encodeWord(URLDecoder

decode(absoluteURL strEncoding)));

DataSource source = new AttachmentDataSource(absoluteURL text);

bpsetDataHandler(new DataHandler(source));

mpaddBodyPart(bp);

}

// One part per image link, built the same way with the image type tag.
urlCount = urlImageListsize();

for (int i = ; i < urlCount; i++) {

bp = new MimeBodyPart();

ArrayList urlInfo = (ArrayList) urlImageListget(i);

// String url = urlInfoget()toString();

String absoluteURL = urlInfoget()toString();

bp

addHeader(ContentLocation

javaxmailinternetMimeUtility

encodeWord(URLDecoder

decode(absoluteURL strEncoding)));

DataSource source = new AttachmentDataSource(absoluteURL image);

bpsetDataHandler(new DataHandler(source));

mpaddBodyPart(bp);

}

msgsetContent(mp);

// write the mime multi part message to a file

// NOTE(review): the FileOutputStream created here is never closed in this method.
msgwriteTo(new FileOutputStream(strFileName));

}

/**
 * Converts an MHT file back into an HTML file plus a directory of resource files.
 *
 * The MHT file is read as a MIME message. When its content is a multipart, one body part
 * supplies the page text and the parts visited by the loop are saved as files in a directory
 * derived from strHtml. Each resource URL in the text is then replaced with the absolute path
 * of its saved file, and the text is written to strHtml. Any exception is printed and swallowed.
 *
 * NOTE(review): part indices, the loop start value, the part-count threshold and the directory
 * suffix literal were lost in this copy. The body-part variable is declared twice under the
 * same name, which suggests a digit suffix was stripped from one of them -- confirm.
 * NOTE(review): the FileInputStream opened here is never closed, and the local variable
 * "source" is assigned but not used.
 *
 * @param strMht path of the MHT file to read
 * @param strHtml path of the HTML file to write
 */

public static void mhthtml(String strMht String strHtml) {

try {

InputStream fis = new FileInputStream(strMht);

Session mailSession = SessiongetDefaultInstance(SystemgetProperties() null);

MimeMessage msg = new MimeMessage(mailSession fis);

Object content = msggetContent();

// Anything that is not a multipart message is ignored silently.
if (content instanceof Multipart) {

MimeMultipart mp = (MimeMultipart)content;

MimeBodyPart bp = (MimeBodyPart)mpgetBodyPart();

String strEncodng = getEncoding(bp);

String strText = getHtmlText(bp strEncodng);

if (strText == null)

return;

File parent = null;

// The resource directory is created only when the part count exceeds the (lost) threshold.
if (mpgetCount() > ) {

parent = new File(new File(strHtml)getAbsolutePath() + files);

parentmkdirs();

if (!parentexists())

return;

}

for (int i = ; i < mpgetCount(); ++i) {

MimeBodyPart bp = (MimeBodyPart)mpgetBodyPart(i);

String strUrl = getResourcesUrl(bp);

// Parts without a location header cannot be matched to a link in the page.
if (strUrl == null)

continue;

DataHandler dataHandler = bpgetDataHandler();

MimePartDataSource source = (MimePartDataSource)dataHandlergetDataSource();

File resources = new File(parentgetAbsolutePath() + Fileseparator + getName(strUrl i));

// Point the page at the local copy only when saving it succeeded.
if (saveResourcesFile(resources bpgetInputStream()))

strText = JHtmlClearreplace(strText strUrl resourcesgetAbsolutePath());

}

saveHtml(strText strHtml);

}

} catch (Exception e) {

// Failures are only printed; the method has no way to report them to the caller.

eprintStackTrace();

}

}

/**

*<br>方法说明得到资源文件的name

*<br>输入参数strName 资源文件链接 ID 资源文件的序号

*<br>返回类型资源文件的本地临时文件名

*/

public static String getName(String strName int ID) {

char separator = /;

Systemoutprintln(strName);

Systemoutprintln(separator);

if( strNamelastIndexOf(separator) >= )

return format(strNamesubstring(strNamelastIndexOf(separator) + ));

return temp + ID;

}

/**
 * Reads the character encoding of a body part from its content-type header.
 *
 * The headers are scanned for the content-type header. When its value contains a charset
 * entry, the text taken from that entry is returned; one specific encoding name is mapped to
 * gbk first.
 *
 * NOTE(review): the header-name literal, the comparison constants, the offset inside the
 * substring call and the encoding name compared before the gbk mapping were lost in this copy
 * -- confirm against the original source.
 *
 * @param bp body part holding the page
 * @return the encoding name, or null when bp is null, no charset is found, or reading the
 *         headers fails
 */

private static String getEncoding(MimeBodyPart bp) {

if (bp != null) {

try {

Enumeration list = bpgetAllHeaders();

while (listhasMoreElements()) {

javaxmailHeader head = (javaxmailHeader)listnextElement();

if (headgetName(pareTo(ContentType) == ) {

String strType = headgetValue();

int pos = strTypeindexOf(charset=);

if (pos != ) {

String strEncoding = strTypesubstring(pos + strTypelength());

// Map the matched encoding name to gbk before returning it.
if (strEncodingtoLowerCase(pareTo(gb) == ) {

strEncoding = gbk;

}

return strEncoding;

}

}

}

} catch (MessagingException e) {

// Header access failed; fall through to the null result.

eprintStackTrace();

}

}

return null;

}

/**
 * Reads the resource URL of a body part from its content-location header.
 *
 * NOTE(review): the header-name literal and the comparison constant were lost in this copy.
 *
 * @param bp body part of a resource
 * @return value of the first matching header, or null when bp is null, the header is absent,
 *         or reading the headers fails
 */

private static String getResourcesUrl(MimeBodyPart bp) {

if (bp != null) {

try {

Enumeration list = bpgetAllHeaders();

while (listhasMoreElements()) {

javaxmailHeader head = (javaxmailHeader)listnextElement();

if (headgetName(pareTo(ContentLocation) == ) {

return headgetValue();

}

}

} catch (MessagingException e) {

// Header access failed; fall through to the null result.

eprintStackTrace();

}

}

return null;

}

/**
 * Turns a name taken from a URL into one that is acceptable as a file name.
 *
 * After an initial replaceAll, every character of the name that also occurs in the strText
 * character set is replaced throughout the name.
 *
 * NOTE(review): the replaceAll arguments, the replacement character and the loop/comparison
 * constants were lost in this copy. The character-set literal is garbled as well; the
 * "___FCKpd___" fragment looks like an artifact of the page the code was published on rather
 * than part of the intended set -- restore all of these from the original source.
 *
 * @param strName file name to clean
 * @return the cleaned name, or null when strName is null
 */

private static String format(String strName) {

if (strName == null)

return null;

strName = strNamereplaceAll( );

// Characters to be replaced in the name.
String strText = \\/:*?\<>|^___FCKpd___quot;;

for (int i = ; i < strNamelength(); ++i) {

String ch = StringvalueOf(strNamecharAt(i));

if (strTextindexOf(ch) != ) {

// Replaces every occurrence of this character, not just the one at position i.
strName = strNamereplace(strNamecharAt(i) );

}

}

return strName;

}

/**

*<br>方法说明保存资源文件

*<br>输入参数resources 要创建的资源文件; inputStream 要输入文件中的流

*<br>返回类型boolean

*/

private static boolean saveResourcesFile(File resources InputStream inputStream) {

if (resources == null || inputStream == null) {

return false;

}

BufferedInputStream in = null;

FileOutputStream fio = null;

BufferedOutputStream osw = null;

try {

in = new BufferedInputStream(inputStream);

fio = new FileOutputStream(resources);

osw = new BufferedOutputStream(new DataOutputStream(fio));

int b;

byte[] a = new byte[];

boolean isEmpty = true;

while ((b = inread(a)) != ) {

isEmpty = false;

oswwrite(a b);

oswflush();

}

oswclose();

fioclose();

inclose();

inputStreamclose();

if (isEmpty)

resourcesdelete();

return true;

} catch (Exception e) {

// TODO Autogenerated catch block

eprintStackTrace();

Systemoutprintln(解析mht失败);

return false;

} finally{

try {

if (osw != null)

oswclose();

if (fio != null)

fioclose();

if (in != null)

inclose();

if (inputStream != null)

inputStreamclose();

} catch (Exception e) {

eprintStackTrace();

Systemoutprintln(解析mht失败);

return false;

}

}

}

/**
 * Extracts the title of the page stored in an MHT file.
 *
 * The file is read as a MIME message. When its content is a multipart, the page text is taken
 * from one body part, lower-cased, and the text between the title tags is returned trimmed.
 * Because the search runs on the lower-cased text, the returned title is lower-cased too.
 *
 * NOTE(review): the body-part index, the comparison constants and the offset added to the
 * start position were lost in this copy. The two position variables now share one name, which
 * suggests digit suffixes were stripped -- confirm against the original source.
 * NOTE(review): the FileInputStream opened here is never closed.
 *
 * @param mhtFilename path of the MHT file
 * @return the title, or null when it cannot be determined or an exception occurs
 */

public static String getTitle(String mhtFilename) {

try {

InputStream fis = new FileInputStream(mhtFilename);

Session mailSession = SessiongetDefaultInstance(SystemgetProperties() null);

MimeMessage msg = new MimeMessage(mailSession fis);

Object content = msggetContent();

if (content instanceof Multipart) {

MimeMultipart mp = (MimeMultipart)content;

MimeBodyPart bp = (MimeBodyPart)mpgetBodyPart();

String strEncodng = getEncoding(bp);

String strText = getHtmlText(bp strEncodng);

if (strText == null)

return null;

// Lower-casing makes the tag search case-insensitive, at the cost of the title's casing.
strText = strTexttoLowerCase();

int pos = strTextindexOf(<title>);

int pos = strTextindexOf(</title>);

if (pos != && pos!= && pos > pos) {

return strTextsubstring(pos + pos)trim();

}

}

return null;

} catch (Exception e) {

// Any failure is printed and reported as "no title".

eprintStackTrace();

return null;

}

}

/**
 * Reads the content of a body part as text.
 *
 * The part's input stream is decoded with strEncoding and read line by line; every line is
 * appended to the result followed by the line-terminator literal shown below, so the original
 * line endings are not preserved.
 *
 * @param bp body part holding the page
 * @param strEncoding encoding used to decode the content
 * @return the text, or null when reading fails (the exception is printed)
 */

private static String getHtmlText(MimeBodyPart bp String strEncoding) {

InputStream textStream = null;

BufferedInputStream buff = null;

BufferedReader br = null;

Reader r = null;

try {

textStream = bpgetInputStream();

buff = new BufferedInputStream(textStream);

// NOTE(review): strEncoding can be null here, because getEncoding returns null when no
// charset is found; presumably that ends in the catch block below -- confirm.
r = new InputStreamReader(buff strEncoding);

br = new BufferedReader(r);

StringBuffer strHtml = new StringBuffer();

String strLine = null;

while ((strLine = brreadLine()) != null) {

strHtmlappend(strLine + \r\n);

}

brclose();

rclose();

textStreamclose();

return strHtmltoString();

} catch (Exception e) {

// Failure is printed; the method then falls through to the null result.

eprintStackTrace();

} finally{

// The streams are closed again here, so on the success path they are closed twice.
try{

if (br != null)

brclose();

if (buff != null)

buffclose();

if (textStream != null)

textStreamclose();

}catch(Exception e){

Systemoutprintln(解析mht失败);

}

}

return null;

}

/**

*<br>方法说明保存html文件

*<br>输入参数strText html内容; strHtml html文件名

*<br>返回类型

*/

private static void saveHtml(String strText String strHtml) {

try {

FileWriter fw = new FileWriter(strHtml);

fwwrite(strText);

fwclose();

} catch (IOException e) {

// TODO Autogenerated catch block

eprintStackTrace();

Systemoutprintln(解析mht失败);

}

}

/**
 * Splits a list of e-mail addresses and converts each entry to an InternetAddress.
 *
 * NOTE(review): the delimiter argument of the tokenizer and the loop start value were lost in
 * this copy -- restore them from the original source.
 *
 * @param emails address list in a single string
 * @return one InternetAddress per token, in token order
 * @throws Exception when an address cannot be converted
 */
private InternetAddress[] getInetAddresses(String emails) throws Exception {

ArrayList list = new ArrayList();

StringTokenizer tok = new StringTokenizer(emails );

while (tokhasMoreTokens()) {

listadd(toknextToken());

}

int count = listsize();

InternetAddress[] addresses = new InternetAddress[count];

for (int i = ; i < count; i++) {

addresses[i] = new InternetAddress(listget(i)toString());

}

return addresses;

}

/**
 * DataSource for one downloaded resource (script, stylesheet or image) of the page.
 *
 * The resource is downloaded in the constructor through the JQuery helper and kept in memory;
 * getInputStream serves those bytes.
 *
 * NOTE(review): string literals, separators and numeric constants were lost in this copy.
 */
class AttachmentDataSource implements DataSource {

private MimetypesFileTypeMap map = new MimetypesFileTypeMap();

private String strUrl;

private String strType;

// Holds the downloaded bytes of the resource (not a size, despite the name).
private byte[] dataSize = null;

/**

* Content types keyed by the type tag passed to the constructor.

*/

private Map normalMap = new HashMap();

{

// Initiate normal mime type map

// NOTE(review): with this table every resource tagged as an image is reported with the
// JPEG content type, whatever its actual format -- confirm this is intended.

normalMapput(image image/jpeg);

normalMapput(text text/plain);

}

/**
 * Stores the type tag and URL, then downloads the resource.
 *
 * The field keeps the URL exactly as passed in; the trimmed and rewritten copy is only used
 * for the download.
 *
 * @param strUrl absolute URL of the resource
 * @param strType type tag used to look up the content type
 */
public AttachmentDataSource(String strUrl String strType) {

thisstrType = strType;

thisstrUrl = strUrl;

strUrl = strUrltrim();

strUrl = strUrlreplaceAll( %);

dataSize = JQuerydownBinaryFile(strUrl null);

}

/**

* Returns the content type

*/

public String getContentType() {

return getMimeType(getName());

}

/**
 * Returns the part of the URL after the last separator, or the whole URL when it contains none.
 *
 * NOTE(review): the separator is the platform file separator, not the URL separator; on
 * platforms where the two differ this presumably returns the whole URL -- confirm.
 */
public String getName() {

char separator = FileseparatorChar;

if( strUrllastIndexOf(separator) >= )

return strUrlsubstring(strUrllastIndexOf(separator) + );

return strUrl;

}

/**
 * Looks up the content type by type tag first, then by file name through the JDK type map,
 * and falls back to a generic binary type.
 */
private String getMimeType(String fileName) {

String type = (String)normalMapget(strType);

if (type == null) {

try {

type = mapgetContentType(fileName);

} catch (Exception e) {

// Ignored on purpose: a failed lookup falls back to the default type below.

}

Systemoutprintln(type);

// Fix the null exception

if (type == null) {

type = application/octetstream;

}

}

return type;

}

/**
 * Serves the downloaded bytes; an array is substituted when the download produced nothing.
 */
public InputStream getInputStream() throws IOException {

if (dataSize == null)

dataSize = new byte[];

return new ByteArrayInputStream(dataSize);

}

/**
 * Returns a fresh in-memory stream; nothing written to it is kept by this data source.
 */
public OutputStream getOutputStream() throws IOException {

return new javaioByteArrayOutputStream();

}

}

}

               

上一篇:Java元数据总结:Java注释的使用和定义

下一篇:Java调用Windows控制台命令