Backend Development 19 min read

Java Web Crawler for Downloading Sogou Image Search Results

This article demonstrates how to build a Java-based web crawler that fetches image URLs from Sogou Image Search, parses the JSON response, and downloads thousands of pictures to a local directory using multithreaded processing and custom HTTP utilities.

Java Captain
Java Captain
Java Captain
Java Web Crawler for Downloading Sogou Image Search Results

The goal of this tutorial is to crawl Sogou Image Search, retrieve thousands of picture URLs matching a keyword (e.g., "美女"), and save the images locally.

Preparation involves constructing the request URL, for example: https://pic.sogou.com/pics?query=%E7%BE%8E%E5%A5%B3

By opening the page in a browser and inspecting the Network → XHR panel, you can see the API request used to fetch image data. Important query parameters include start (starting index), xml_len (number of items per request), and query (search keyword).

The implementation follows four main steps: (1) set the URL request parameters; (2) send the request and extract the picUrl field from the JSON response; (3) store all URLs in a list; (4) iterate the list and download each image using a thread pool for concurrency.

Below are the complete Java source files used in the project.

import com.alibaba.fastjson.JSONObject; import us.codecraft.webmagic.utils.HttpClientUtils; import victor.chang.crawler.pipeline.SougouImgPipeline; import java.util.ArrayList; import java.util.List; /** * A simple PageProcessor. * @author [email protected] * @since 0.1.0 */ public class SougouImgProcessor { private String url; private SougouImgPipeline pipeline; private List dataList; private List urlList; private String word; public SougouImgProcessor(String url,String word) { this.url = url; this.word = word; this.pipeline = new SougouImgPipeline(); this.dataList = new ArrayList<>(); this.urlList = new ArrayList<>(); } public void process(int idx, int size) { String res = HttpClientUtils.get(String.format(this.url, idx, size, this.word)); JSONObject object = JSONObject.parseObject(res); List items = (List )((JSONObject)object.get("data")).get("items"); for(JSONObject item : items){ this.urlList.add(item.getString("picUrl")); } this.dataList.addAll(items); } // 下载 public void pipelineData(){ //pipeline.process(this.urlList, word); // 单线程 pipeline.processSync(this.urlList, this.word); // 多线程 } public static void main(String[] args) { String url = "https://pic.sogou.com/napi/pc/searchList?mode=1&start=%s&xml_len=%s&query=%s"; SougouImgProcessor processor = new SougouImgProcessor(url,"美女"); int start = 0, size = 50, limit = 1000; // 定义爬取开始索引、每次爬取数量、总共爬取数量 for(int i=start;i

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.util.List;
import java.util.Objects;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * Stores downloaded pictures as files under {@code <path>/<category>/}.
 *
 * <p>The success and failure counters are atomic, so one instance may be used
 * by the worker threads it starts itself.
 *
 * @author [email protected]
 * @since 0.1.0
 */
public class SougouImgPipeline {

    /** Connect and read timeout for each download, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 5000;
    /** Longest URL suffix (dot included) still accepted as a file extension. */
    private static final int MAX_EXTENSION_LENGTH = 6;
    /** Longest URL-derived fragment kept in a generated file name. */
    private static final int MAX_BASENAME_LENGTH = 40;

    /** Fallback extension used when the URL does not carry a usable one. */
    private String extension = ".jpg";
    private String path;
    private final AtomicInteger suc = new AtomicInteger();
    private final AtomicInteger fails = new AtomicInteger();

    public SougouImgPipeline() {
        setPath("E:/pipeline/sougou");
    }

    /** @param path root directory that receives one sub-directory per category */
    public SougouImgPipeline(String path) {
        setPath(path);
    }

    /**
     * @param path      root directory that receives one sub-directory per category
     * @param extension fallback file extension, including the leading dot
     */
    public SougouImgPipeline(String path, String extension) {
        setPath(path);
        this.extension = extension;
    }

    public void setPath(String path) {
        this.path = path;
    }

    /**
     * Downloads one picture into {@code <path>/<cate>/}.
     *
     * <p>The bytes are first written to a temporary {@code .part} file and moved
     * into place only after the transfer completed, so an aborted download never
     * leaves a truncated file that a later run would skip as "already present".
     *
     * @param url  address of the picture
     * @param cate category (sub-directory) name
     * @param name file name without extension; hyphens are removed
     * @throws IOException if the directory cannot be created or the transfer fails
     */
    private void downloadImg(String url, String cate, String name) throws IOException {
        File dir = new File(this.path + "/" + cate + "/");
        // isDirectory() is re-checked because another worker may have created it meanwhile.
        if (!dir.exists() && !dir.mkdirs() && !dir.isDirectory()) {
            throw new IOException("Cannot create directory " + dir);
        }

        String fileName = (name + extensionOf(url)).replace("-", "");
        File img = new File(dir, fileName);
        if (img.exists()) {
            // Already downloaded by an earlier run: skip.
            System.out.println(String.format("文件%s已存在本地目录", fileName));
            return;
        }

        URLConnection con = new URL(url).openConnection();
        con.setConnectTimeout(TIMEOUT_MILLIS);
        con.setReadTimeout(TIMEOUT_MILLIS);

        // The thread id keeps concurrent workers from sharing a temporary file.
        Path partial = new File(dir, fileName + "." + Thread.currentThread().getId() + ".part").toPath();
        try (InputStream inputStream = con.getInputStream()) {
            Files.copy(inputStream, partial, StandardCopyOption.REPLACE_EXISTING);
            Files.move(partial, img.toPath(), StandardCopyOption.REPLACE_EXISTING);
        } finally {
            Files.deleteIfExists(partial);
        }

        System.out.println("picUrl: " + url);
        System.out.println(String.format("正在下载第%s张图片", suc.incrementAndGet()));
    }

    /**
     * Returns the extension (with leading dot) found in the last path segment of
     * {@code url}, ignoring query string and fragment; falls back to the
     * configured default when there is none or it does not look like one.
     */
    private String extensionOf(String url) {
        String bare = stripQueryAndFragment(url);
        int slash = bare.lastIndexOf('/');
        int dot = bare.lastIndexOf('.');
        if (dot > slash && dot < bare.length() - 1 && bare.length() - dot <= MAX_EXTENSION_LENGTH) {
            String candidate = bare.substring(dot);
            boolean alphanumeric = true;
            for (int i = 1; i < candidate.length(); i++) {
                if (!Character.isLetterOrDigit(candidate.charAt(i))) {
                    alphanumeric = false;
                    break;
                }
            }
            if (alphanumeric) {
                return candidate;
            }
        }
        return extension;
    }

    /**
     * Builds a file-system-safe base name for {@code url}: the hex hash of the
     * full URL followed by a sanitised, length-limited copy of its last path
     * segment. A raw URL cannot be used because it contains ':' and '/'.
     */
    private static String fileNameFor(String url) {
        String bare = stripQueryAndFragment(url);
        String last = bare.substring(bare.lastIndexOf('/') + 1);
        int dot = last.lastIndexOf('.');
        if (dot > 0) {
            last = last.substring(0, dot);
        }
        String safe = last.replaceAll("[^A-Za-z0-9_]", "_");
        if (safe.length() > MAX_BASENAME_LENGTH) {
            safe = safe.substring(0, MAX_BASENAME_LENGTH);
        }
        return String.format("%08x", url.hashCode()) + "_" + safe;
    }

    /** Returns {@code url} cut at the first '?' or '#', whichever comes first. */
    private static String stripQueryAndFragment(String url) {
        int end = url.length();
        int query = url.indexOf('?');
        if (query >= 0) {
            end = query;
        }
        int fragment = url.indexOf('#');
        if (fragment >= 0 && fragment < end) {
            end = fragment;
        }
        return url.substring(0, end);
    }

    /** Counts a failed download and reports its cause on stderr. */
    private void recordFailure(String url, Exception cause) {
        fails.incrementAndGet();
        System.err.println("Download failed for " + url + ": " + cause);
    }

    /**
     * Downloads all pictures sequentially on the calling thread.
     *
     * @param data picture URLs; {@code null} entries are skipped
     * @param word category (sub-directory) name
     */
    public void process(List<String> data, String word) {
        long start = System.currentTimeMillis();
        for (String picUrl : data) {
            if (picUrl == null) {
                continue;
            }
            try {
                downloadImg(picUrl, word, fileNameFor(picUrl));
            } catch (Exception e) {
                recordFailure(picUrl, e);
            }
        }
        System.out.println("下载成功: " + suc.get());
        System.out.println("下载失败: " + fails.get());
        long end = System.currentTimeMillis();
        System.out.println("耗时:" + (end - start) / 1000 + "秒");
    }

    /**
     * Downloads all pictures concurrently, one task per URL, and waits up to
     * 60 seconds for them to finish before printing statistics. Files are named
     * after the zero-padded position of their URL in {@code data}.
     *
     * @param data picture URLs; {@code null} entries are skipped
     * @param word category (sub-directory) name
     */
    public void processSync(List<String> data, String word) {
        long start = System.currentTimeMillis();
        ExecutorService executorService = Executors.newCachedThreadPool();
        for (int i = 0; i < data.size(); i++) {
            final String picUrl = data.get(i);
            if (picUrl == null) {
                continue;
            }
            final String finalName = String.format("%04d", i);
            executorService.execute(() -> {
                try {
                    downloadImg(picUrl, word, finalName);
                } catch (Exception e) {
                    recordFailure(picUrl, e);
                }
            });
        }
        executorService.shutdown();
        try {
            if (!executorService.awaitTermination(60, TimeUnit.SECONDS)) {
                // Timed out: interrupt every worker that is still running.
                executorService.shutdownNow();
            }
            System.out.println("AwaitTermination Finished");
            System.out.println("共有URL: " + data.size());
            System.out.println("下载成功: " + suc);
            System.out.println("下载失败: " + fails);

            // list() returns null when the directory was never created (nothing downloaded).
            String[] stored = new File(this.path + "/" + word + "/").list();
            int len = stored == null ? 0 : stored.length;
            System.out.println("当前共有文件:" + len);

            long end = System.currentTimeMillis();
            System.out.println("耗时:" + (end - start) / 1000.0 + "秒");
        } catch (InterruptedException e) {
            executorService.shutdownNow();
            // Restore the flag so callers can see that this thread was interrupted.
            Thread.currentThread().interrupt();
        }
    }

    /**
     * Splits {@code data} into {@code threadNum} contiguous segments and runs
     * {@link #process(List, String)} on each segment in its own worker. Returns
     * without waiting for the workers; each one prints its own statistics.
     *
     * @param data      picture URLs
     * @param word      category (sub-directory) name
     * @param threadNum number of segments; values below 1 or above the list size
     *                  fall back to sequential processing
     */
    public void processSync2(List<String> data, final String word, int threadNum) {
        if (threadNum < 1 || data.size() < threadNum) {
            process(data, word);
            return;
        }
        ExecutorService executorService = Executors.newCachedThreadPool();
        int num = data.size() / threadNum; // items per segment
        for (int i = 0; i < threadNum; i++) {
            int start = i * num;
            // The last segment also takes the remainder of the division.
            int end = (i == threadNum - 1) ? data.size() : (i + 1) * num;
            final List<String> cutList = data.subList(start, end);
            executorService.execute(() -> process(cutList, word));
        }
        executorService.shutdown();
    }
}

import org.apache.http.Header; import org.apache.http.HttpEntity; import org.apache.http.NameValuePair; import org.apache.http.client.entity.UrlEncodedFormEntity; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.client.methods.HttpPost; import org.apache.http.client.methods.HttpUriRequest; import org.apache.http.conn.ssl.SSLConnectionSocketFactory; import org.apache.http.conn.ssl.TrustStrategy; import org.apache.http.entity.StringEntity; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.apache.http.message.BasicNameValuePair; import org.apache.http.ssl.SSLContextBuilder; import org.apache.http.util.EntityUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import javax.net.ssl.HostnameVerifier; import javax.net.ssl.SSLContext; import javax.net.ssl.SSLSession; import java.io.IOException; import java.io.UnsupportedEncodingException; import java.security.GeneralSecurityException; import java.security.cert.CertificateException; import java.security.cert.X509Certificate; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; /** * @author [email protected] * Date: 17/3/27 */ public abstract class HttpClientUtils { public static Map > convertHeaders(Header[] headers) { Map > results = new HashMap<>(); for (Header header : headers) { List list = results.get(header.getName()); if (list == null) { list = new ArrayList<>(); results.put(header.getName(), list); } list.add(header.getValue()); } return results; } /** http的get请求 */ public static String get(String url) { return get(url, "UTF-8"); } public static Logger logger = LoggerFactory.getLogger(HttpClientUtils.class); /** http的get请求 */ public static String get(String url, String charset) { HttpGet httpGet = new HttpGet(url); return executeRequest(httpGet, charset); } /** http的get请求,增加异步请求头参数 */ public static String 
ajaxGet(String url) { return ajaxGet(url, "UTF-8"); } /** http的get请求,增加异步请求头参数 */ public static String ajaxGet(String url, String charset) { HttpGet httpGet = new HttpGet(url); httpGet.setHeader("X-Requested-With", "XMLHttpRequest"); return executeRequest(httpGet, charset); } /** http的post请求,传递map格式参数 */ public static String post(String url, Map dataMap) { return post(url, dataMap, "UTF-8"); } /** http的post请求,传递map格式参数 */ public static String post(String url, Map dataMap, String charset) { HttpPost httpPost = new HttpPost(url); try { if (dataMap != null) { List nvps = new ArrayList<>(); for (Map.Entry entry : dataMap.entrySet()) { nvps.add(new BasicNameValuePair(entry.getKey(), entry.getValue())); } UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity(nvps, charset); formEntity.setContentEncoding(charset); httpPost.setEntity(formEntity); } } catch (UnsupportedEncodingException e) { e.printStackTrace(); } return executeRequest(httpPost, charset); } /** http的post请求,增加异步请求头参数,传递map格式参数 */ public static String ajaxPost(String url, Map dataMap) { return ajaxPost(url, dataMap, "UTF-8"); } /** http的post请求,增加异步请求头参数,传递map格式参数 */ public static String ajaxPost(String url, Map dataMap, String charset) { HttpPost httpPost = new HttpPost(url); httpPost.setHeader("X-Requested-With", "XMLHttpRequest"); try { if (dataMap != null) { List nvps = new ArrayList<>(); for (Map.Entry entry : dataMap.entrySet()) { nvps.add(new BasicNameValuePair(entry.getKey(), entry.getValue())); } UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity(nvps, charset); formEntity.setContentEncoding(charset); httpPost.setEntity(formEntity); } } catch (UnsupportedEncodingException e) { e.printStackTrace(); } return executeRequest(httpPost, charset); } /** http的post请求,增加异步请求头参数,传递json格式参数 */ public static String ajaxPostJson(String url, String jsonString) { return ajaxPostJson(url, jsonString, "UTF-8"); } /** http的post请求,增加异步请求头参数,传递json格式参数 */ public static String ajaxPostJson(String url, String 
jsonString, String charset) { HttpPost httpPost = new HttpPost(url); httpPost.setHeader("X-Requested-With", "XMLHttpRequest"); StringEntity stringEntity = new StringEntity(jsonString, charset); // 解决中文乱码问题 stringEntity.setContentEncoding(charset); stringEntity.setContentType("application/json"); httpPost.setEntity(stringEntity); return executeRequest(httpPost, charset); } /** 执行一个http请求,传递HttpGet或HttpPost参数 */ public static String executeRequest(HttpUriRequest httpRequest) { return executeRequest(httpRequest, "UTF-8"); } /** 执行一个http请求,传递HttpGet或HttpPost参数 */ public static String executeRequest(HttpUriRequest httpRequest, String charset) { CloseableHttpClient httpclient; if ("https".equals(httpRequest.getURI().getScheme())) { httpclient = createSSLInsecureClient(); } else { httpclient = HttpClients.createDefault(); } String result = ""; try { try { CloseableHttpResponse response = httpclient.execute(httpRequest); HttpEntity entity = null; try { entity = response.getEntity(); result = EntityUtils.toString(entity, charset); } finally { EntityUtils.consume(entity); response.close(); } } finally { httpclient.close(); } } catch (IOException ex) { ex.printStackTrace(); } return result; } public static String executeRequest(CloseableHttpClient httpclient, HttpUriRequest httpRequest, String charset) { String result = ""; try { try { CloseableHttpResponse response = httpclient.execute(httpRequest); HttpEntity entity = null; try { entity = response.getEntity(); result = EntityUtils.toString(entity, charset); } finally { EntityUtils.consume(entity); response.close(); } } finally { httpclient.close(); } } catch (IOException ex) { ex.printStackTrace(); } return result; } /** 创建 SSL连接 */ public static CloseableHttpClient createSSLInsecureClient() { try { SSLContext sslContext = new SSLContextBuilder().loadTrustMaterial(new TrustStrategy() { @Override public boolean isTrusted(X509Certificate[] chain, String authType) throws CertificateException { return true; } }).build(); 
SSLConnectionSocketFactory sslsf = new SSLConnectionSocketFactory(sslContext, new HostnameVerifier() { @Override public boolean verify(String hostname, SSLSession session) { return true; } }); return HttpClients.custom().setSSLSocketFactory(sslsf).build(); } catch (GeneralSecurityException ex) { throw new RuntimeException(ex); } } }

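Running the program may not download every image on the first attempt due to network instability. Because files that already exist in the target directory are skipped, each re‑run only retries the images that are still missing, so re‑executing the crawler several times improves the overall success rate.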

Feel free to star or like the post if you find the guide helpful.

Tags: Java · HTTP · Multithreading · Web Scraping · Image Download · Sogou
Java Captain
Written by

Java Captain

Focused on Java technologies: SSM, the Spring ecosystem, microservices, MySQL, MyCat, clustering, distributed systems, middleware, Linux, networking, multithreading; occasionally covers DevOps tools like Jenkins, Nexus, Docker, ELK; shares practical tech insights and is dedicated to full‑stack Java development.

0 followers
Reader feedback

How this landed with the community

Sign in to like

Rate this article

Was this worth your time?

Sign in to rate
Discussion

0 Comments

Thoughtful readers leave field notes, pushback, and hard-won operational detail here.