1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104
| package individual.cy.douban.utils;
import org.apache.http.HttpEntity; import org.apache.http.HttpHost; import org.apache.http.ParseException; import org.apache.http.client.config.RequestConfig; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.apache.http.util.EntityUtils;
import java.io.IOException;
public class Spider { public static String pickData(String url) { try (CloseableHttpClient client = HttpClients.createDefault()) { HttpGet httpGet = new HttpGet(url); CloseableHttpResponse response = client.execute(httpGet); HttpEntity entity = response.getEntity(); if (entity != null) { return EntityUtils.toString(entity); } } catch (ParseException | IOException e) { e.printStackTrace(); return ""; } return ""; }
public static String pick4data(String url) { RequestConfig config = RequestConfig.custom().setConnectTimeout(3000). setSocketTimeout(3000).build(); HttpGet httpGet = new HttpGet(url); return grab(httpGet,config); }
public static String pick4data(String url, String ip, String port) { System.out.println("此时线程: " + Thread.currentThread().getName() + " 爬取所使用的代理为: " + ip + ":" + port); HttpHost proxy = new HttpHost(ip, Integer.parseInt(port)); RequestConfig config = RequestConfig.custom().setProxy(proxy).setConnectTimeout(3000). setSocketTimeout(3000).build(); HttpGet httpGet = new HttpGet(url); return grab(httpGet,config); }
private static String grab(HttpGet httpGet, RequestConfig config){ httpGet.setConfig(config); httpGet.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;" + "q=0.9,image/webp,*/*;q=0.8"); httpGet.setHeader("Accept-Encoding", "gzip, deflate, sdch"); httpGet.setHeader("Accept-Language", "zh-CN,zh;q=0.8"); httpGet.setHeader("Cache-Control", "no-cache"); httpGet.setHeader("Connection", "keep-alive"); httpGet.setHeader("Host", "www.xicidaili.com"); httpGet.setHeader("Pragma", "no-cache"); httpGet.setHeader("Upgrade-Insecure-Requests", "1"); httpGet.setHeader("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36");
try (CloseableHttpClient httpClient = HttpClients.createDefault(); CloseableHttpResponse httpResponse = httpClient.execute(httpGet)) { int status = 200; if (httpResponse.getStatusLine().getStatusCode() == status) { HttpEntity entity = httpResponse.getEntity(); if (entity != null) { return EntityUtils.toString(entity, "utf-8"); } } } catch (ParseException | IOException e) { e.printStackTrace(); return ""; } return ""; } }
|