-
大小: 文件類型: .rar 金幣: 1 下載: 0 次 發布日期: 2023-08-02
- 語言: 其他
- 標簽: 網絡爬蟲 源碼
資源簡介
自己動手寫網絡爬蟲完整版 源碼
代碼片段和文件信息
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;

import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
public?class?DownLoadFile?{
/**
?*?根據(jù)?url?和網(wǎng)頁類型生成需要保存的網(wǎng)頁的文件名?去除掉?url?中非文件名字符
?*/
public??String?getFileNameByUrl(String?urlString?contentType)
{
//remove?http://
url=url.substring(7);
//text/html類型
if(contentType.indexOf(“html“)!=-1)
{
url=?url.replaceAll(“[\\?/:*|<>\“]“?“_“)+“.html“;
return?url;
}
//如application/pdf類型
else
{
??????????return?url.replaceAll(“[\\?/:*|<>\“]“?“_“)+“.“+
??????????contentType.substring(contentType.lastIndexOf(“/“)+1);
}
}
/**
?*?保存網(wǎng)頁字節(jié)數(shù)組到本地文件?filePath?為要保存的文件的相對地址
?*/
private?void?saveToLocal(byte[]?data?String?filePath)?{
try?{
DataOutputStream?out?=?new?DataOutputStream(new?FileOutputStream(
new?File(filePath)));
for?(int?i?=?0;?i? out.write(data[i]);
out.flush();
out.close();
}?catch?(IOException?e)?{
e.printStackTrace();
}
}
/*?下載?url?指向的網(wǎng)頁?*/
public?String?downloadFile(String?url)?{
String?filePath?=?null;
/*?1.生成?HttpClinet?對象并設(shè)置參數(shù)?*/
HttpClient?httpClient?=?new?HttpClient();
//?設(shè)置?Http?連接超時?5s
httpClient.getHttpConnectionManager().getParams().setConnectionTimeout(
5000);
/*?2.生成?GetMethod?對象并設(shè)置參數(shù)?*/
GetMethod?getMethod?=?new?GetMethod(url);
//?設(shè)置?get?請求超時?5s
getMethod.getParams().setParameter(HttpMethodParams.SO_TIMEOUT?5000);
//?設(shè)置請求重試處理
getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER
new?DefaultHttpMethodRetryHandler());
/*?3.執(zhí)行?HTTP?GET?請求?*/
try?{
int?statusCode?=?httpClient.executeMethod(getMethod);
//?判斷訪問的狀態(tài)碼
if?(statusCode?!=?HttpStatus.SC_OK)?{
System.err.println(“Method?failed:?“
+?getMethod.getStatusLine());
filePath?=?null;
}
/*?4.處理?HTTP?響應(yīng)內(nèi)容?*/
byte[]?responseBody?=?getMethod.getResponseBody();//?讀取為字節(jié)數(shù)組
//?根據(jù)網(wǎng)頁?url?生成保存時的文件名
filePath?=?“temp\\“
+?getFileNameByUrl(url?getMethod.getResponseHeader(
“Content-Type“).getValue());
saveToLocal(responseBody?filePath);
}?catch?(HttpException?e)?{
//?發(fā)生致命的異常,可能是協(xié)議不對或者返回的內(nèi)容有問題
System.out.println(“Please?check?your?provided?http?address!“);
e.printStackTrace();
}?catch?(IOException?e)?{
//?發(fā)生網(wǎng)絡(luò)異常
e.printStackTrace();
}?finally?{
//?釋放連接
getMethod.releaseConnection();
}
return?filePath;
}
}
 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     文件       4478  2010-06-28 13:46  Chap04\Channelli
     文件      17812  2010-06-28 13:46  Chap04\ExtractContext.java
     文件        825  2010-04-24 11:08  Chap04\HtmlParser\.classpath
     文件        386  2010-05-04 08:52  Chap04\HtmlParser\.project
     文件        629  2010-04-19 16:37  Chap04\HtmlParser\.settings\org.eclipse.jdt.core.prefs
     文件      11334  2010-04-26 15:54  Chap04\HtmlParser\src\com\lietu\htmlParser\HtmlParser.java
     文件       8312  2006-09-23 14:25  Chap04\HtmlParser\src\doc-files\building.html
     文件       5074  2006-09-17 12:47  Chap04\HtmlParser\src\doc-files\overview.html
     文件       4896  2006-09-17 13:00  Chap04\HtmlParser\src\doc-files\using.html
     文件      26096  2006-09-17 07:24  Chap04\HtmlParser\src\org\htmlparser\Attribute.java
     文件      10617  2006-09-17 07:25  Chap04\HtmlParser\src\org\htmlparser\beans\BeanyBaby.form
     文件      13209  2006-09-17 07:25  Chap04\HtmlParser\src\org\htmlparser\beans\BeanyBaby.java
     文件      13762  2006-09-17 07:25  Chap04\HtmlParser\src\org\htmlparser\beans\FilterBean.java
     文件       6547  2006-09-17 07:25  Chap04\HtmlParser\src\org\htmlparser\beans\HTMLli
     文件       9020  2006-09-17 07:25  Chap04\HtmlParser\src\org\htmlparser\beans\HTMLTextBean.java
     文件        213  2006-09-17 07:25  Chap04\HtmlParser\src\org\htmlparser\beans\images\Chain16.gif
     文件        278  2006-09-17 07:25  Chap04\HtmlParser\src\org\htmlparser\beans\images\Chain32.gif
     文件        140  2006-09-17 07:25  Chap04\HtmlParser\src\org\htmlparser\beans\images\Knot16.gif
     文件        167  2006-09-17 07:25  Chap04\HtmlParser\src\org\htmlparser\beans\images\Knot32.gif
     文件       8602  2006-09-17 07:25  Chap04\HtmlParser\src\org\htmlparser\beans\li
     文件       2188  2006-09-17 07:25  Chap04\HtmlParser\src\org\htmlparser\beans\package.html
     文件      23110  2006-09-17 07:25  Chap04\HtmlParser\src\org\htmlparser\beans\StringBean.java
     文件       3485  2006-09-17 07:25  Chap04\HtmlParser\src\org\htmlparser\filters\AndFilter.java
     文件      13731  2006-09-22 20:26  Chap04\HtmlParser\src\org\htmlparser\filters\CssSelectorNodeFilter.java
     文件       4224  2006-09-17 07:25  Chap04\HtmlParser\src\org\htmlparser\filters\HasAttributeFilter.java
     文件       5213  2006-09-17 07:25  Chap04\HtmlParser\src\org\htmlparser\filters\HasChildFilter.java
     文件       4821  2006-09-17 07:25  Chap04\HtmlParser\src\org\htmlparser\filters\HasParentFilter.java
     文件       3556  2006-09-17 07:25  Chap04\HtmlParser\src\org\htmlparser\filters\HasSiblingFilter.java
     文件       1825  2006-09-17 07:25  Chap04\HtmlParser\src\org\htmlparser\filters\IsEqualFilter.java
     文件       3184  2006-09-17 07:25  Chap04\HtmlParser\src\org\htmlparser\filters\li
............此處省略2237個文件信息
評論
共有 條評論