抓取各大网站的数据并插入数据库，这样就不用为没有数据而烦恼了。
获取百度的歌曲名、歌手和链接：
package webTools;
import javaioBufferedReader;
import javaioIOException;
import javaioInputStreamReader;
import javaioUnsupportedEncodingException;
import MalformedURLException;
import URL;
import javautilArrayList;
import javautilHashMap;
import javautilList;
import javautilregexMatcher;
import javautilregexPattern;
import dbToolsDBTools;
/**
 * Page scraper: downloads an HTML document, pulls table-cell links out of it
 * with regular expressions and stores one row per link through DBTools.
 *
 * NOTE(review): this listing looks like it lost punctuation during extraction
 * (member-access dots, string quotes, argument commas and numeric literals
 * are missing, for example in the rowContent initializer), so it does not
 * compile as shown. The comments below describe only what the remaining
 * tokens show; anything about a lost literal is marked as an assumption.
 */
public class IOTOWeb {
/**
 * Reads the resource behind htmlURL and returns its text.
 *
 * Lines are read one at a time with readLine and appended without any
 * separator, so the returned text contains no line terminators.
 *
 * @param htmlURL address handed to the URL constructor
 * @return the text accumulated in the buffer; when one of the caught
 *         exceptions occurs this is whatever was read before the failure
 */
public String getHtmlContent(String htmlURL) {
URL url = null;
String rowContent = ;
StringBuffer htmlContent = new StringBuffer();
try {
url = new URL(htmlURL);
// NOTE(review): the charset argument reads gb in this listing; presumably a
// truncated GB-family charset name - confirm against the original post.
BufferedReader in = new BufferedReader(new InputStreamReader(url
openStream() gb));
while ((rowContent = inreadLine()) != null) {
htmlContentappend(rowContent);
}
// Closed on the success path only; there is no finally block.
inclose();
} catch (MalformedURLException e) {
// Reported on stderr only; execution falls through to the return below.
eprintStackTrace();
} catch (UnsupportedEncodingException e) {
// Reported on stderr only; execution falls through to the return below.
eprintStackTrace();
} catch (IOException e) {
// Reported on stderr only; execution falls through to the return below.
eprintStackTrace();
}
return htmlContenttoString();
}
/**
 * Collects every complete match of the table-cell pattern in htmlContent.
 *
 * The pattern text starts with a td tag, contains an anchor with an href
 * attribute and ends with the closing td tag.
 *
 * @param htmlContent markup to scan
 * @return list of matched substrings (whole match, not a capture group);
 *         empty when nothing matches
 */
public List getLink(String htmlContent) {
ArrayList listLink = new ArrayList();
String regex = <td[^>]*>[\\(]*<a[^>]*href=(\([^\]*)\|\([^\]*)\|([^\\s>]*))[^>]*>(*?)[\\)]*[\\s]*</td>;
Pattern pattern = pile(regex PatternDOTALL);
Matcher matcher = patternmatcher(htmlContent);
while (matcherfind()) {
listLinkadd(matchergroup());
}
return listLink;
}
/**
 * Collects the href attribute matches found in htmlContent.
 *
 * Each whole match is passed through replaceFirst and replace before it is
 * stored. NOTE(review): the arguments of both calls are damaged in this
 * listing; presumably they strip the href= prefix and the surrounding
 * quote characters - confirm.
 *
 * @param htmlContent markup to scan
 * @return list of processed matches; empty when nothing matches
 */
public List<String> getHref(String htmlContent) {
String regex;
List listtHref = new ArrayList();
regex = href=(\([^\]*)\|\([^\]*)\|([^\\s>]*))\;
Pattern pa = pile(regex PatternDOTALL);
Matcher ma = pamatcher(htmlContent);
while (mafind()) {
listtHrefadd(magroup()replaceFirst(href=\ )replace(\
));
}
return listtHref;
}
/**
 * Collects the performer fragments found in htmlContent.
 *
 * Uses its own pattern (two href-bearing parts, the second one an anchor)
 * and applies the same replaceFirst and replace post-processing as getHref
 * to each whole match. NOTE(review): the start of the pattern and the
 * replacement arguments are damaged in this listing.
 *
 * @param htmlContent markup to scan
 * @return list of processed matches; empty when nothing matches
 */
public List<String> getPerson(String htmlContent) {
String regex;
List list = new ArrayList();
regex = ]*href=(\"([^\"]*)\"|\([^\]*)\|([^\\s>]*))[^>]*>(*?)\\>\\(<a[^>]*href=(\([^\]*)\|\([^\]*)\|([^\\s>]*))[^>]*>(*?)\\);
Pattern pa = pile(regex PatternDOTALL);
Matcher ma = pamatcher(htmlContent);
while (mafind()) {
listadd(magroup()replaceFirst(href=\ )replace(\ ));
}
return list;
}
/**
 * Collects every anchor element in htmlContent that matches the song-name
 * pattern (an anchor with an href attribute, its closing tag, then \\s).
 *
 * The matches are stored unmodified, tags included.
 *
 * @param htmlContent markup to scan
 * @return list of matched anchor substrings; empty when nothing matches
 */
public List<String> getSongName(String htmlContent) {
String regex;
// Despite its name this list holds the song-name anchors returned below.
List listPerson = new ArrayList();
regex = <a[^>]*href=(\([^\]*)\|\([^\]*)\|([^\\s>]*))[^>]*>(*?)</a>\\s;
Pattern pa = pile(regex PatternDOTALL);
Matcher ma = pamatcher(htmlContent);
while (mafind()) {
listPersonadd(magroup());
}
return listPerson;
}
/**
 * Extracts the listing tables from a page.
 *
 * Every match of the table pattern (a table tag with class list through
 * the closing table tag) is appended to one buffer in document order.
 *
 * @param htmlContent full page markup
 * @return the concatenated matches; empty text when nothing matches
 */
public String getMainContent(String htmlContent) {
String regex = <table width=\%\ align=\center\ cellpadding=\\ cellspacing=\\ class=\list\>(*?)</table>;
StringBuffer mainContent = new StringBuffer();
Pattern pattern = pile(regex PatternDOTALL);
Matcher matcher = patternmatcher(htmlContent);
while (matcherfind()) {
mainContentappend(matchergroup());
}
return mainContenttoString();
}
/**
 * Applies replaceAll with an angle-bracket pattern to s.
 *
 * NOTE(review): pattern and replacement are damaged in this listing;
 * presumably this removes markup tags from s - confirm.
 *
 * @param s text to process
 * @return result of the replaceAll call
 */
public String outTag(final String s) {
return sreplaceAll(<*?> );
}
// Database helper used by getFromBaiduMap; created once per IOTOWeb instance.
DBTools dbTools = new DBTools();
/**
 * Scrapes one page and inserts a song row for every matched table cell.
 *
 * Steps: fetch the page, cut out the listing tables, split them into cells
 * with getLink, then for each cell derive song name, performer and link and
 * pass them as parameters of an insert statement to DBTools update.
 *
 * @param htmlURL address of the page to scrape
 * @throws Throwable declared by this method; DBTools update declares the same
 */
public void getFromBaiduMap(String htmlURL) throws Throwable {
// Declared but never referenced again in this method.
HashMap htmlContentMap = new HashMap();
String htmlContent = getHtmlContent(htmlURL);
String mainContent = getMainContent(htmlContent);
List listLink = getLink(mainContent);
for (int j = ; j < listLinksize(); j++) {
String tdTag = listLinkget(j)toString();
List songNameList = getSongName(tdTag);
// NOTE(review): the index passed to get is missing in this listing;
// presumably the first element - confirm.
String songName = outTag(songNameListget()toString());
List personList = getPerson(tdTag);
String songPerson = ;
if (personListsize() != ) {
for (int n = ; n < personListsize(); n++) {
// Systemoutprintln(personListget(n)toString());
// Plain assignment: each pass replaces the previous value, so only the
// last entry of personList ends up in songPerson.
songPerson = outTag(personListget(n)toString());
}
} else {
// Placeholder value used when the cell yields no performer match.
songPerson = 无;
}
// Systemoutprint(songNameListget()toString());
List hrefList = getHref(songNameListget()toString());
String songHref = hrefListget()toString();
Systemoutprintln();
String sql = insert into song(songNamesongPersonsongHref) values(???);
// Values are added in the column order named in the insert statement.
ArrayList list_values = new ArrayList();
list_valuesadd(songName);
list_valuesadd(songPerson);
list_valuesadd(songHref);
dbToolsupdate(sql list_values);
}
}
}
DBTools：数据库连接类
package dbTools;
import javautilArrayList;
import javasql*;
/**
 * Small JDBC helper that owns one Connection and runs parameterised
 * statements on it.
 *
 * NOTE(review): this listing looks like it lost punctuation during extraction
 * (dots, quotes, commas and numeric literals), so it does not compile as
 * shown. The statement and result set are kept in instance fields that every
 * call overwrites.
 */
public class DBTools {
private PreparedStatement preparedStatement;
private ResultSet resultSet;
private Connection connection;
/**
 * Loads the driver class and opens the connection.
 *
 * Both failures are only printed; when getConnection throws, the
 * connection field is never assigned.
 */
public DBTools() {
try {
ClassforName(commysqljdbcDriver);
} catch (ClassNotFoundException e) {
// Reported on stderr only; construction continues.
eprintStackTrace();
}
try {
// NOTE(review): the port in the JDBC URL is missing in this listing; the
// remaining tokens show host localhost, schema TestURL and two further
// arguments (presumably user and password) - confirm.
connection = DriverManagergetConnection(
jdbc:mysql://localhost:/TestURL root zhuyi);
} catch (SQLException e) {
// Reported on stderr only; connection stays unassigned.
eprintStackTrace();
}
}
/**
 * Runs a parameterised query and returns all rows as String arrays.
 *
 * @param sql statement text with positional placeholders
 * @param list_values values bound to the placeholders in list order
 * @return one String array per row, one element per result column, read
 *         with getString
 * @throws Throwable declared by this method; nothing is caught inside
 */
public ArrayList query(String sql ArrayList list_values) throws Throwable {
ArrayList listRows = new ArrayList();
preparedStatement = connectionprepareStatement(sql);
for (int i = ; i < list_valuessize(); i++) {
// NOTE(review): the offset added to i is missing in this listing;
// presumably 1 because JDBC parameter indexes start at 1 - confirm.
preparedStatementsetObject(i + list_valuesget(i));
}
resultSet = preparedStatementexecuteQuery();
while (resultSetnext()) {
// Array length follows the column count reported by the result metadata.
String[] rowinfo = new String[resultSetgetMetaData()
getColumnCount()];
for (int i = ; i < rowinfolength; i++) {
rowinfo[i] = resultSetgetString(i + );
}
listRowsadd(rowinfo);
}
// NOTE(review): neither the statement nor the result set is closed here.
return listRows;
}
/**
 * Runs a parameterised insert, update or delete statement.
 *
 * @param sql statement text with positional placeholders
 * @param list_values values bound to the placeholders in list order
 * @throws Throwable declared by this method; nothing is caught inside
 */
public void update(String sql ArrayList list_values) throws Throwable {
preparedStatement = connectionprepareStatement(sql);
for (int i = ; i < list_valuessize(); i++) {
preparedStatementsetObject(i + list_valuesget(i));
}
preparedStatementexecuteUpdate();
// Reached only when the calls above did not throw; there is no finally.
preparedStatementclose();
}
}
Servlet调用
代码
package controller;
import javaioIOException;
import javaioPrintWriter;
import javautilList;
import javaxservletServletException;
import javaxservlethttpHttpServlet;
import javaxservlethttpHttpServletRequest;
import javaxservlethttpHttpServletResponse;
import webToolsIOTOWeb;
/**
 * Servlet whose GET handler triggers one IOTOWeb scrape.
 *
 * NOTE(review): this listing looks like it lost punctuation during extraction
 * (dots, quotes, commas), so it does not compile as shown.
 */
public class TestURL extends HttpServlet {
/**
 * Creates the servlet; only calls the superclass constructor.
 */
public TestURL() {
super();
}
/**
 * Called on servlet shutdown; only delegates to the superclass.
 */
public void destroy() {
superdestroy(); // Just puts destroy string in log
// Put your code here
}
/**
 * Handles GET by creating an IOTOWeb and calling getFromBaiduMap once.
 *
 * Nothing is written to the response. Any Throwable raised by the scrape is
 * printed and not rethrown.
 *
 * @param request
 * the request sent by the client to the server
 * @param response
 * the response sent by the server to the client
 * @throws ServletException
 * declared by the signature
 * @throws IOException
 * declared by the signature
 */
public void doGet(HttpServletRequest request HttpServletResponse response)
throws ServletException IOException {
try {
IOTOWeb iotoWeb = new IOTOWeb();
// NOTE(review): most of the URL literal is missing in this listing; only the
// query-string fragment is left.
iotoWebgetFromBaiduMap(?id=?top);
} catch (Throwable e) {
// Reported on stderr only; the request still completes normally.
eprintStackTrace();
}
}
/**
 * Handles POST by writing a fixed HTML page that names this class.
 *
 * Sets the content type, prints the page through the response writer, then
 * flushes and closes the writer.
 *
 * @param request
 * the request sent by the client to the server
 * @param response
 * the response sent by the server to the client
 * @throws ServletException
 * declared by the signature
 * @throws IOException
 * declared by the signature
 */
public void doPost(HttpServletRequest request HttpServletResponse response)
throws ServletException IOException {
responsesetContentType(text/html);
PrintWriter out = responsegetWriter();
out
println(<!DOCTYPE HTML PUBLIC \//WC//DTD HTML Transitional//EN\>);
outprintln(<HTML>);
outprintln( <HEAD><TITLE>A Servlet</TITLE></HEAD>);
outprintln( <BODY>);
outprint( This is );
outprint(thisgetClass());
outprintln( using the POST method);
outprintln( </BODY>);
outprintln(</HTML>);
outflush();
outclose();
}
/**
 * Servlet initialisation hook; empty in this class.
 *
 * @throws ServletException
 * declared by the signature
 */
public void init() throws ServletException {
// Put your code here
}
}
获取金书网的图书名
代码
package webTools;
import javaioBufferedReader;
import javaioInputStreamReader;
import URL;
import javautilArrayList;
import javautilList;
import javautilregexMatcher;
import javautilregexPattern;
import dbToolsDBTools;
/**
 * Page scraper that extracts one book title from a page and stores it
 * through DBTools.
 *
 * NOTE(review): this listing looks like it lost punctuation during extraction
 * (dots, quotes, commas and numeric literals), so it does not compile as
 * shown.
 */
public class GetBook {
/**
 * Reads the resource behind htmlURL and returns its text.
 *
 * Lines are read with readLine and appended without any separator, so the
 * returned text contains no line terminators.
 *
 * @param htmlURL address handed to the URL constructor
 * @return the concatenated lines of the resource
 * @throws Throwable declared by this method; nothing is caught inside, so
 *         URL and I/O failures reach the caller
 */
public String getHtmlContent(String htmlURL) throws Throwable {
URL url = null;
String rowContent = ;
StringBuffer htmlContent = new StringBuffer();
url = new URL(htmlURL);
// NOTE(review): the charset argument reads gb in this listing; presumably a
// truncated GB-family charset name - confirm against the original post.
BufferedReader in = new BufferedReader(new InputStreamReader(url
openStream() gb));
while ((rowContent = inreadLine()) != null) {
htmlContentappend(rowContent);
}
// Not reached when reading throws; there is no finally block.
inclose();
return htmlContenttoString();
}
/**
 * Returns the first span element in htmlContent that matches the title
 * pattern (a span tag with a class attribute through its closing tag).
 *
 * @param htmlContent markup to scan
 * @return the whole first match, tags included; the initial value of
 *         bookName when nothing matches
 */
public String getBookName(String htmlContent) {
String bookName = ;
String regex = <span class=\style\>[^>]*</span>;
Pattern pattern = pile(regex PatternDOTALL);
Matcher matcher = patternmatcher(htmlContent);
// Only the first match is used; find is called once.
if (matcherfind()) {
bookName = matchergroup();
}
return bookName;
}
/**
 * Applies replaceAll with an angle-bracket pattern to s.
 *
 * NOTE(review): pattern and replacement are damaged in this listing;
 * presumably this removes markup tags from s - confirm.
 *
 * @param s text to process
 * @return result of the replaceAll call
 */
public String outTag(final String s) {
return sreplaceAll(<*?> );
}
// Database helper used by getFromJINSHU; created once per GetBook instance.
DBTools dbtools = new DBTools();
/**
 * Scrapes one page and inserts its book title into the bookinfo table.
 *
 * The title is printed and inserted only when it passes the null and
 * equals checks below.
 *
 * @param htmlURL address of the page to scrape
 * @throws Throwable declared by this method; getHtmlContent and DBTools
 *         update declare the same
 */
public void getFromJINSHU(String htmlURL) throws Throwable {
String htmlContent = getHtmlContent(htmlURL);
String bookName = outTag(getBookName(htmlContent));
// NOTE(review): the receiver of equals is missing in this listing;
// presumably an empty string literal, making this a non-empty check.
if (bookName != null && !equals(bookName)) {
Systemoutprintln(bookName);
String sql = insert into bookinfo(bookName) values(?);
ArrayList list_values = new ArrayList();
list_valuesadd(bookName);
dbtoolsupdate(sql list_values);
}
}
}
调用Servlet
代码
package controller;
import javaioIOException;
import javaioPrintWriter;
import javaxservletServletException;
import javaxservlethttpHttpServlet;
import javaxservlethttpHttpServletRequest;
import javaxservlethttpHttpServletResponse;
import webToolsGetBook;
/**
 * Servlet that walks a range of numbered book pages and feeds each one to
 * GetBook getFromJINSHU.
 *
 * NOTE(review): this listing looks like it lost punctuation during extraction
 * (dots, quotes, commas and numeric literals), so it does not compile as
 * shown. The start value of i, the loop bound and most of the URL literal
 * are missing.
 */
public class TestBook extends HttpServlet {
/**
 * Creates the servlet; only calls the superclass constructor.
 */
public TestBook() {
super();
}
/**
 * Called on servlet shutdown; only delegates to the superclass.
 */
public void destroy() {
superdestroy(); // Just puts destroy string in log
// Put your code here
}
/**
 * Page counter shared by doGet and doPost.
 *
 * It is an instance field, so its value persists between calls on the same
 * servlet object, and neither loop resets it.
 */
int i = ;
/**
 * Handles GET by scraping pages from the current value of i up to the
 * loop bound.
 *
 * The for statement has no initializer, so it continues from wherever i
 * currently stands. When a scrape throws, i is incremented and doPost is
 * called, which runs the same loop and calls back into doGet on its own
 * failures. NOTE(review): each failure therefore nests one more call on
 * the stack - confirm this is intended. Nothing is written to the response.
 *
 * @param request
 * the request sent by the client to the server
 * @param response
 * the response sent by the server to the client
 * @throws ServletException
 * declared by the signature
 * @throws IOException
 * declared by the signature
 */
public void doGet(HttpServletRequest request HttpServletResponse response)
throws ServletException IOException {
GetBook bookinfo = new GetBook();
for (; i < ; i++) {
String bookURL = /booksinfo// + i
+ l;
try {
bookinfogetFromJINSHU(bookURL);
} catch (Throwable e) {
// Move past the failing page, then continue in doPost; the Throwable
// itself is not logged.
i++;
doPost(request response);
}
}
}
/**
 * Handles POST with the same loop as doGet.
 *
 * Continues from the current value of i; when a scrape throws, i is
 * incremented and doGet is called. Nothing is written to the response.
 *
 * @param request
 * the request sent by the client to the server
 * @param response
 * the response sent by the server to the client
 * @throws ServletException
 * declared by the signature
 * @throws IOException
 * declared by the signature
 */
public void doPost(HttpServletRequest request HttpServletResponse response)
throws ServletException IOException {
GetBook bookinfo = new GetBook();
for (; i < ; i++) {
String bookURL = /booksinfo// + i
+ l;
try {
bookinfogetFromJINSHU(bookURL);
} catch (Throwable e) {
// Move past the failing page, then continue in doGet; the Throwable
// itself is not logged.
i++;
doGet(request response);
}
}
}
/**
 * Servlet initialisation hook; empty in this class.
 *
 * @throws ServletException
 * declared by the signature
 */
public void init() throws ServletException {
// Put your code here
}
}
每种功能的实现方法有很多，希望各位可以交流不同的思想和方法。