How to Crawl and Download Thousands of Sogou Images Using Java

This guide explains how to scrape thousands of images from Sogou by analyzing the request URL, extracting image URLs from JSON responses, and implementing a multithreaded Java downloader with custom HTTP utilities and pipelines to store the pictures locally.

Java Backend Technology
Java Backend Technology
Java Backend Technology
How to Crawl and Download Thousands of Sogou Images Using Java

Purpose

Crawl Sogou image search for thousands of pictures of a given keyword (e.g., "美女") and download them to the local file system.

Preparation

Target URL:

https://pic.sogou.com/pics?query=美女

Analysis

Open the page, use the browser developer tools (Network → XHR) and scroll down to see the request URL:

Request URL:

https://pic.sogou.com/napi/pc/searchList?mode=1&start=48&xml_len=48&query=美女

Key parameters:

- start=48 – index of the first image to return
- xml_len=48 – number of images to fetch per request
- query=美女 – search keyword (the browser URL-encodes it automatically)

The JSON response contains the image URLs in the picUrl field.

Approach

1. Build the request URL with the required parameters.
2. Send HTTP GET requests to obtain the JSON data.
3. Extract the picUrl values and store them in a list.
4. Use a thread pool to download the images concurrently and save them locally.

Code

import com.alibaba.fastjson.JSONObject;
import us.codecraft.webmagic.utils.HttpClientUtils;
import victor.chang.crawler.pipeline.SougouImgPipeline;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;

/**
 * Collects image URLs from the Sogou image-search JSON endpoint and hands them
 * to a {@link SougouImgPipeline} for download.
 *
 * <p>The request URL is a {@link String#format} template that receives, in this
 * order, the start index, the batch size and the URL-encoded search keyword.
 */
public class SougouImgProcessor {
    private final String url;
    private final SougouImgPipeline pipeline;
    private final List<JSONObject> dataList;
    private final List<String> urlList;
    private final String word;

    /**
     * @param url  request URL template with three format specifiers
     *             (start index, batch size, keyword)
     * @param word raw (not yet URL-encoded) search keyword; it is also passed to
     *             the pipeline as the download category
     */
    public SougouImgProcessor(String url, String word) {
        this.url = url;
        this.word = word;
        this.pipeline = new SougouImgPipeline();
        this.dataList = new ArrayList<>();
        this.urlList = new ArrayList<>();
    }

    /**
     * Fetches one batch of search results and records the {@code picUrl} of every item.
     *
     * <p>A batch whose response is blank, is not valid JSON, or lacks the
     * {@code data.items} list is reported on stderr and skipped, so one bad
     * response does not abort the whole crawl.
     *
     * @param idx  index of the first result to request
     * @param size number of results to request
     */
    public void process(int idx, int size) {
        String requestUrl = String.format(this.url, idx, size, encodeQuery(this.word));
        String res = HttpClientUtils.get(requestUrl);
        if (res == null || res.isEmpty()) {
            System.err.println("Empty response, skipping batch: " + requestUrl);
            return;
        }

        JSONObject object;
        try {
            object = JSONObject.parseObject(res);
        } catch (RuntimeException e) {
            // The parser signals malformed input with an unchecked exception.
            System.err.println("Unparseable response, skipping batch: " + requestUrl + " (" + e + ")");
            return;
        }

        Object data = object == null ? null : object.get("data");
        Object items = data instanceof JSONObject ? ((JSONObject) data).get("items") : null;
        if (!(items instanceof List)) {
            System.err.println("Response has no data.items list, skipping batch: " + requestUrl);
            return;
        }

        for (Object entry : (List<?>) items) {
            if (entry instanceof JSONObject) {
                JSONObject item = (JSONObject) entry;
                this.urlList.add(item.getString("picUrl"));
                this.dataList.add(item);
            }
        }
    }

    /** Downloads every URL collected so far using the pipeline's multi-threaded mode. */
    public void pipelineData() {
        pipeline.processSync(this.urlList, this.word);
    }

    /**
     * Percent-encodes the keyword for use as a query-string value, so keywords
     * containing spaces, {@code &}, {@code #} or non-ASCII text yield a valid URL.
     */
    private static String encodeQuery(String keyword) {
        try {
            return URLEncoder.encode(String.valueOf(keyword), "UTF-8");
        } catch (UnsupportedEncodingException e) {
            // Every Java platform is required to support UTF-8.
            throw new IllegalStateException("UTF-8 is not supported", e);
        }
    }

    public static void main(String[] args) {
        String url = "https://pic.sogou.com/napi/pc/searchList?mode=1&start=%s&xml_len=%s&query=%s";
        SougouImgProcessor processor = new SougouImgProcessor(url, "美女");
        int start = 0, size = 50, limit = 1000; // start index, batch size, total number
        for (int i = start; i < start + limit; i += size) {
            processor.process(i, size);
        }
        processor.pipelineData();
    }
}
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.List;
import java.util.Objects;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * Saves images referenced by URL into {@code <path>/<category>/} on the local disk.
 *
 * <p>Files are named after the position of their URL in the input list
 * (zero-padded to three digits) plus the extension found in the URL path.
 * The success and failure counters belong to the pipeline instance and keep
 * accumulating across calls.
 */
public class SougouImgPipeline {
    /** Fallback extension used when the URL path does not end in a usable one. */
    private String extension = ".jpg";
    private String path;
    private final AtomicInteger suc;
    private final AtomicInteger fails;

    /** Creates a pipeline that stores files under {@code E:/pipeline/sougou}. */
    public SougouImgPipeline() {
        setPath("E:/pipeline/sougou");
        suc = new AtomicInteger();
        fails = new AtomicInteger();
    }

    /**
     * @param path root directory under which one sub-directory per category is created
     */
    public SougouImgPipeline(String path) {
        setPath(path);
        suc = new AtomicInteger();
        fails = new AtomicInteger();
    }

    public void setPath(String path) {
        this.path = path;
    }

    /**
     * Downloads a single image to {@code <path>/<cate>/<name><ext>}.
     *
     * <p>An existing target file is left untouched and is counted neither as a
     * success nor as a failure.
     *
     * @param url  image URL
     * @param cate category, used as the sub-directory name
     * @param name file name without extension; hyphens are removed from the final name
     * @throws Exception if the directory cannot be created, the URL is malformed,
     *                   or the transfer fails
     */
    private void downloadImg(String url, String cate, String name) throws Exception {
        String dirPath = this.path + "/" + cate + "/";
        File dir = new File(dirPath);
        // mkdirs() also returns false when another worker created the directory first,
        // so only fail if the directory is really missing afterwards.
        if (!dir.exists() && !dir.mkdirs() && !dir.isDirectory()) {
            throw new IllegalStateException("Cannot create directory " + dirPath);
        }

        URL source = new URL(url);
        String fileName = (name + resolveExtension(source)).replace("-", "");
        File img = new File(dirPath + fileName);
        if (img.exists()) {
            System.out.println(String.format("File %s already exists", fileName));
            return;
        }

        URLConnection con = source.openConnection();
        con.setConnectTimeout(5000);
        con.setReadTimeout(5000);
        try (InputStream inputStream = con.getInputStream();
             FileOutputStream os = new FileOutputStream(img)) {
            byte[] bs = new byte[8192];
            int len;
            while ((len = inputStream.read(bs)) != -1) {
                os.write(bs, 0, len);
            }
        } catch (Exception e) {
            // Both streams are closed by now. Remove the truncated file so that a
            // later run retries it instead of reporting "already exists".
            img.delete();
            throw e;
        }
        System.out.println("picUrl: " + url);
        System.out.println(String.format("Downloading image %s", suc.getAndIncrement()));
    }

    /**
     * Returns the extension (including the dot) of the last path segment of the URL,
     * ignoring query string and fragment, or the default extension when the segment
     * has none or it is not 1-5 ASCII letters/digits.
     */
    private String resolveExtension(URL source) {
        String urlPath = source.getPath();
        int dot = urlPath.lastIndexOf('.');
        if (dot > urlPath.lastIndexOf('/')) {
            String candidate = urlPath.substring(dot);
            if (candidate.matches("\\.[A-Za-z0-9]{1,5}")) {
                return candidate;
            }
        }
        return extension;
    }

    /** Runs one download, counting and reporting a failure instead of propagating it. */
    private void downloadQuietly(String picUrl, String word, String name) {
        try {
            downloadImg(picUrl, word, name);
        } catch (Exception e) {
            fails.incrementAndGet();
            System.err.println("Failed to download " + picUrl + ": " + e);
        }
    }

    /**
     * Downloads the images one after another on the calling thread.
     *
     * @param data image URLs; {@code null} entries are skipped
     * @param word category, used as the sub-directory name
     */
    public void process(List<String> data, String word) {
        long start = System.currentTimeMillis();
        for (int i = 0; i < data.size(); i++) {
            String picUrl = data.get(i);
            if (picUrl == null) {
                continue;
            }
            // Name files by list position: a URL is not a valid file name.
            downloadQuietly(picUrl, word, String.format("%03d", i));
        }
        System.out.println("Success: " + suc.get());
        System.out.println("Failed: " + fails.get());
        System.out.println("Time: " + (System.currentTimeMillis() - start) / 1000 + "s");
    }

    /**
     * Downloads the images concurrently on a cached thread pool and waits up to
     * 60 seconds for the downloads to finish before printing the statistics.
     *
     * @param data image URLs; {@code null} entries are skipped
     * @param word category, used as the sub-directory name
     */
    public void processSync(List<String> data, String word) {
        long start = System.currentTimeMillis();
        ExecutorService executor = Executors.newCachedThreadPool();
        for (int i = 0; i < data.size(); i++) {
            String picUrl = data.get(i);
            if (picUrl == null) {
                continue;
            }
            String name = String.format("%03d", i);
            executor.execute(() -> downloadQuietly(picUrl, word, name));
        }
        executor.shutdown();
        try {
            if (!executor.awaitTermination(60, TimeUnit.SECONDS)) {
                // Already-submitted downloads keep running; only the report is early.
                System.err.println("Downloads still running after 60s; the counts below are incomplete");
            }
        } catch (InterruptedException e) {
            // Stop the workers and preserve the interrupt for the caller.
            executor.shutdownNow();
            Thread.currentThread().interrupt();
        }
        System.out.println("Total URLs: " + data.size());
        System.out.println("Success: " + suc);
        System.out.println("Failed: " + fails);
        System.out.println("Time: " + (System.currentTimeMillis() - start) / 1000.0 + "s");
    }
}
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.NameValuePair;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.conn.ssl.TrustStrategy;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.ssl.SSLContextBuilder;
import org.apache.http.util.EntityUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.net.ssl.HostnameVerifier;
import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLSession;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.security.GeneralSecurityException;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Static helpers for issuing HTTP GET/POST requests and reading the response body
 * as a string.
 *
 * <p>Every request method returns the response body, or an empty string when the
 * request fails with an {@link IOException} or the response carries no body.
 *
 * <p><strong>Security note:</strong> requests whose scheme is {@code https} are sent
 * through {@link #createSSLInsecureClient()}, which accepts any certificate chain and
 * any host name. Do not use these helpers for traffic that needs TLS authentication.
 */
public abstract class HttpClientUtils {
    /**
     * Groups header values by header name, preserving the order of values per name.
     *
     * @param headers headers to convert
     * @return map from header name to all values seen for that name
     */
    public static Map<String, List<String>> convertHeaders(Header[] headers) {
        Map<String, List<String>> results = new HashMap<>();
        for (Header header : headers) {
            results.computeIfAbsent(header.getName(), key -> new ArrayList<>()).add(header.getValue());
        }
        return results;
    }

    /** Sends a GET request and decodes the body as UTF-8. */
    public static String get(String url) {
        return get(url, "UTF-8");
    }

    /** Sends a GET request and decodes the body with the given charset. */
    public static String get(String url, String charset) {
        HttpGet httpGet = new HttpGet(url);
        return executeRequest(httpGet, charset);
    }

    /** Sends a GET request marked as an XMLHttpRequest and decodes the body as UTF-8. */
    public static String ajaxGet(String url) {
        return ajaxGet(url, "UTF-8");
    }

    /** Sends a GET request marked as an XMLHttpRequest and decodes the body with the given charset. */
    public static String ajaxGet(String url, String charset) {
        HttpGet httpGet = new HttpGet(url);
        httpGet.setHeader("X-Requested-With", "XMLHttpRequest");
        return executeRequest(httpGet, charset);
    }

    /** Sends a form POST using UTF-8. */
    public static String post(String url, Map<String, String> dataMap) {
        return post(url, dataMap, "UTF-8");
    }

    /**
     * Sends a URL-encoded form POST.
     *
     * @param dataMap form fields; {@code null} sends the request without a body
     * @param charset charset used to encode the form and decode the response
     */
    public static String post(String url, Map<String, String> dataMap, String charset) {
        HttpPost httpPost = new HttpPost(url);
        attachForm(httpPost, dataMap, charset);
        return executeRequest(httpPost, charset);
    }

    /** Sends a form POST marked as an XMLHttpRequest using UTF-8. */
    public static String ajaxPost(String url, Map<String, String> dataMap) {
        return ajaxPost(url, dataMap, "UTF-8");
    }

    /**
     * Sends a URL-encoded form POST marked as an XMLHttpRequest.
     *
     * @param dataMap form fields; {@code null} sends the request without a body
     * @param charset charset used to encode the form and decode the response
     */
    public static String ajaxPost(String url, Map<String, String> dataMap, String charset) {
        HttpPost httpPost = new HttpPost(url);
        httpPost.setHeader("X-Requested-With", "XMLHttpRequest");
        attachForm(httpPost, dataMap, charset);
        return executeRequest(httpPost, charset);
    }

    /** Posts a JSON document as an XMLHttpRequest using UTF-8. */
    public static String ajaxPostJson(String url, String jsonString) {
        return ajaxPostJson(url, jsonString, "UTF-8");
    }

    /**
     * Posts a JSON document as an XMLHttpRequest.
     *
     * @param jsonString request body
     * @param charset    charset used to encode the body and decode the response
     */
    public static String ajaxPostJson(String url, String jsonString, String charset) {
        HttpPost httpPost = new HttpPost(url);
        httpPost.setHeader("X-Requested-With", "XMLHttpRequest");
        // No setContentEncoding(charset): Content-Encoding names a compression coding
        // such as gzip, not a character set.
        StringEntity stringEntity = new StringEntity(jsonString, charset);
        stringEntity.setContentType("application/json");
        httpPost.setEntity(stringEntity);
        return executeRequest(httpPost, charset);
    }

    /** Executes the request and decodes the body as UTF-8. */
    public static String executeRequest(HttpUriRequest httpRequest) {
        return executeRequest(httpRequest, "UTF-8");
    }

    /**
     * Executes the request on a freshly created client and returns the response body.
     *
     * @param httpRequest request to execute
     * @param charset     charset passed to the body decoder
     * @return the response body, or an empty string if the request failed with an
     *         {@link IOException} or the response has no entity
     */
    public static String executeRequest(HttpUriRequest httpRequest, String charset) {
        boolean https = "https".equalsIgnoreCase(httpRequest.getURI().getScheme());
        String result = "";
        try (CloseableHttpClient httpclient = https ? createSSLInsecureClient() : HttpClients.createDefault();
             CloseableHttpResponse response = httpclient.execute(httpRequest)) {
            HttpEntity entity = response.getEntity();
            // Responses such as 204/304 carry no entity; decoding null would throw.
            if (entity != null) {
                result = EntityUtils.toString(entity, charset);
            }
        } catch (IOException ex) {
            System.err.println("HTTP request failed: " + httpRequest.getMethod() + " " + httpRequest.getURI());
            ex.printStackTrace();
        }
        return result;
    }

    /**
     * Builds a client that trusts every certificate chain and accepts every host name.
     *
     * <p>This disables TLS server authentication entirely and leaves connections open
     * to man-in-the-middle attacks.
     *
     * @throws RuntimeException if the SSL context cannot be built
     */
    public static CloseableHttpClient createSSLInsecureClient() {
        try {
            SSLContext sslContext = new SSLContextBuilder()
                .loadTrustMaterial((X509Certificate[] chain, String authType) -> true)
                .build();
            SSLConnectionSocketFactory sslsf = new SSLConnectionSocketFactory(sslContext, (hostname, session) -> true);
            return HttpClients.custom().setSSLSocketFactory(sslsf).build();
        } catch (GeneralSecurityException ex) {
            throw new RuntimeException(ex);
        }
    }

    /**
     * Attaches the map as a URL-encoded form body. A {@code null} map, or a charset the
     * platform does not support, leaves the request without a body.
     */
    private static void attachForm(HttpPost httpPost, Map<String, String> dataMap, String charset) {
        if (dataMap == null) {
            return;
        }
        List<NameValuePair> nvps = new ArrayList<>(dataMap.size());
        for (Map.Entry<String, String> entry : dataMap.entrySet()) {
            nvps.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
        }
        try {
            // No setContentEncoding(charset): Content-Encoding names a compression coding
            // such as gzip, not a character set.
            httpPost.setEntity(new UrlEncodedFormEntity(nvps, charset));
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        }
    }
}

Run

Network issues may cause some downloads to fail. Re-run the program (files that already exist are skipped) or add retry logic around the download to improve the success rate.

Original Source

Signed-in readers can open the original source through BestHub's protected redirect.

Sign in to view source
Republication Notice

This article has been distilled and summarized from source material, then republished for learning and reference. If you believe it infringes your rights, please contact admin@besthub.dev and we will review it promptly.

Java · Web Crawling · Image Download · Sogou
Java Backend Technology
Written by

Java Backend Technology

Focus on Java-related technologies: SSM, Spring ecosystem, microservices, MySQL, MyCat, clustering, distributed systems, middleware, Linux, networking, multithreading. Occasionally cover DevOps tools like Jenkins, Nexus, Docker, and ELK. Also share technical insights from time to time, committed to Java full-stack development!

0 followers
Reader feedback

How this landed with the community

Sign in to like

Rate this article

Was this worth your time?

Sign in to rate
Discussion

0 Comments

Thoughtful readers leave field notes, pushback, and hard-won operational detail here.