Backend Development 15 min read

Crawling and Downloading Thousands of Images from Sogou Using Java

This article explains how to scrape thousands of images from Sogou image search: analyzing the request URLs, extracting the image URLs from the JSON responses, and implementing a multithreaded Java downloader that stores the pictures locally.

Java Architect Essentials
Java Architect Essentials
Java Architect Essentials
Crawling and Downloading Thousands of Images from Sogou Using Java

Purpose: Scrape thousands of beautiful images from Sogou and download them to the local machine.

Preparation Target URL: https://pic.sogou.com/pics?query=%E7%BE%8E%E5%A5%B3

Analysis: Open the URL in a browser, open the developer tools (Network → XHR), and scroll the page to capture the request URL that loads more results, e.g. `https://pic.sogou.com/napi/pc/searchList?mode=1&start=48&xml_len=48&query=%E7%BE%8E%E5%A5%B3`. Key parameters: start (starting index), xml_len (number of images per request), and query (search keyword, URL‑encoded).

Inspect the response JSON (using a formatter): the results are listed under data.items, and each item stores its image URL in the picUrl field.

Approach: (1) Set the URL request parameters. (2) Send HTTP requests to obtain the image URLs. (3) Store the URLs in a list. (4) Use a thread pool to download the images concurrently.

Code

SougouImgProcessor.java (image crawling class):

import com.alibaba.fastjson.JSONObject;
import us.codecraft.webmagic.utils.HttpClientUtils;
import victor.chang.crawler.pipeline.SougouImgPipeline;

import java.util.ArrayList;
import java.util.List;

/**
 * A simple PageProcessor.
 */
public class SougouImgProcessor {

    /** Request URL template; formatted with (start index, page size, search word). */
    private String url;
    /** Downloader that receives the collected image URLs. */
    private SougouImgPipeline pipeline;
    /** Raw result items accumulated from every fetched page. */
    private List<JSONObject> dataList;
    /** Value of each item's "picUrl" field, in fetch order. */
    private List<String> urlList;
    /** Search word; also passed to the pipeline as the category name. */
    private String word;

    /**
     * Creates a processor with empty result lists and a default-configured pipeline.
     *
     * @param url  request URL template that {@code process} fills in via {@code String.format}
     * @param word search word substituted into the template and handed to the pipeline
     */
    public SougouImgProcessor(String url,String word) {
        // Collaborator and collections first, then the caller-supplied configuration.
        this.pipeline = new SougouImgPipeline();
        this.urlList = new ArrayList<>();
        this.dataList = new ArrayList<>();
        this.word = word;
        this.url = url;
    }

    /**
     * Fetches one page of search results and records the image URLs it contains.
     *
     * <p>The request URL is built by formatting the URL template with {@code idx}, {@code size}
     * and the search word. Each entry of {@code data.items} in the response contributes its
     * {@code picUrl} value to the URL list (a missing value is stored as {@code null} so list
     * positions stay aligned with the items), and the items themselves are appended to the
     * data list. A response without a usable {@code data.items} list adds nothing.
     *
     * @param idx  start index of the page
     * @param size number of results requested
     */
    @SuppressWarnings("unchecked")
    public void process(int idx, int size) {
        String res = HttpClientUtils.get(String.format(this.url, idx, size, this.word));
        JSONObject object = JSONObject.parseObject(res);

        // Walk data -> items defensively: an error response may lack either level.
        Object data = object == null ? null : object.get("data");
        Object rawItems = data instanceof JSONObject ? ((JSONObject) data).get("items") : null;
        if (!(rawItems instanceof List)) {
            return;
        }

        // Unchecked by necessity: the parsed array is only known to be a List at compile time.
        List<JSONObject> items = (List<JSONObject>) rawItems;
        for (JSONObject item : items) {
            this.urlList.add(item.getString("picUrl"));
        }
        this.dataList.addAll(items);
    }

    /**
     * Downloads every URL collected so far by handing the list to the pipeline's
     * thread-pool based {@code processSync}, using the search word as the category.
     */
    public void pipelineData(){
        final String category = this.word;
        this.pipeline.processSync(this.urlList, category);
    }

    public static void main(String[] args) {
        String url = "https://pic.sogou.com/napi/pc/searchList?mode=1&start=%s&xml_len=%s&query=%s";
        SougouImgProcessor processor = new SougouImgProcessor(url,"美女");
        int start = 0, size = 50, limit = 1000; // start index, batch size, total limit
        for(int i=start;i

SougouImgPipeline.java (image download class):

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.List;
import java.util.Objects;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * Stores results in files: downloads each image URL into {@code <path>/<category>/}.
 *
 * <p>A file is named after the URL's position in the input list, zero-padded to at least four
 * digits, followed by the extension found in the URL path (or the configured fallback
 * extension). A file that already exists is skipped, so running again over the same list only
 * fetches what is missing. The success and failure counters are cumulative across calls on the
 * same instance.
 */
public class SougouImgPipeline {

    /** Upper bound on concurrent downloads, so a long URL list cannot spawn a thread per URL. */
    private static final int MAX_DOWNLOAD_THREADS = 16;

    /** Extension used when the URL path does not end in a recognisable one. */
    private String extension = ".jpg";
    private String path;
    private final AtomicInteger suc = new AtomicInteger();
    private final AtomicInteger fails = new AtomicInteger();

    /** Creates a pipeline that writes below {@code E:/pipeline/sougou}. */
    public SougouImgPipeline() {
        this("E:/pipeline/sougou");
    }

    /**
     * @param path root directory; one sub-directory per category is created below it
     */
    public SougouImgPipeline(String path) {
        this.path = path;
    }

    /**
     * @param path      root directory; one sub-directory per category is created below it
     * @param extension fallback extension including the dot, e.g. {@code ".png"}; a null or
     *                  empty value keeps the default {@code ".jpg"}
     */
    public SougouImgPipeline(String path, String extension) {
        this.path = path;
        if (extension != null && !extension.isEmpty()) {
            this.extension = extension;
        }
    }

    public void setPath(String path) {
        this.path = path;
    }

    /** Returns the file base name for the URL at {@code index}: "0000", "0001", ... "1000". */
    private static String indexName(int index) {
        return String.format("%04d", index);
    }

    /**
     * Returns the extension (with leading dot) of the last segment of a URL path, or the
     * fallback extension when that segment has none or it is not a short alphanumeric suffix.
     */
    private String extensionOf(String urlPath) {
        String lastSegment = urlPath.substring(urlPath.lastIndexOf('/') + 1);
        int dot = lastSegment.lastIndexOf('.');
        if (dot >= 0) {
            String ext = lastSegment.substring(dot);
            if (ext.matches("\\.[A-Za-z0-9]{1,5}")) {
                return ext;
            }
        }
        return extension;
    }

    /**
     * Downloads {@code url} to {@code <path>/<cate>/<name><ext>} unless that file already exists.
     *
     * @throws IOException if the directory cannot be created, the URL is malformed, or the
     *                     transfer fails; no partial target file is left behind in that case
     */
    private void downloadImg(String url, String cate, String name) throws IOException {
        File dir = new File(this.path + "/" + cate + "/");
        // mkdirs() also returns false when another worker created the directory first.
        if (!dir.exists() && !dir.mkdirs() && !dir.isDirectory()) {
            throw new IOException("Cannot create directory " + dir);
        }

        URL source = new URL(url);
        // Use the URL path only, so a query string never ends up in the file name.
        String fileName = name + extensionOf(source.getPath());
        File img = new File(dir, fileName);
        if (img.exists()) {   // skip if already downloaded
            System.out.println(String.format("文件%s已存在本地目录", fileName));
            return;
        }

        URLConnection con = source.openConnection();
        con.setConnectTimeout(5000);
        con.setReadTimeout(5000);

        // Write to a side file and rename at the end: an aborted transfer must not leave a
        // truncated image that the exists() check above would treat as finished next time.
        File partial = new File(dir, fileName + ".part");
        try (InputStream inputStream = con.getInputStream();
             FileOutputStream os = new FileOutputStream(partial)) {
            byte[] bs = new byte[8192];
            int len;
            while ((len = inputStream.read(bs)) != -1) {
                os.write(bs, 0, len);
            }
        } catch (IOException e) {
            partial.delete();
            throw e;
        }
        if (!partial.renameTo(img)) {
            partial.delete();
            throw new IOException("Cannot move " + partial + " to " + img);
        }

        System.out.println("picUrl: " + url);
        // incrementAndGet so the first finished image is reported as number 1, not 0.
        System.out.println(String.format("正在下载第%s张图片", suc.incrementAndGet()));
    }

    /** Runs one download and records any failure instead of propagating it. */
    private void downloadQuietly(String url, String cate, String name) {
        try {
            downloadImg(url, cate, name);
        } catch (Exception e) {
            // Task boundary: one bad URL must not stop the remaining downloads.
            fails.incrementAndGet();
            System.err.println("picUrl failed: " + url + " -> " + e);
        }
    }

    /**
     * Downloads the images one after another on the calling thread.
     *
     * @param data image URLs; {@code null} entries are skipped but still occupy an index
     * @param word category name, used as the sub-directory
     */
    public void process(List<String> data, String word) {
        long start = System.currentTimeMillis();
        int index = 0;
        for (String picUrl : data) {
            String name = indexName(index++);
            if (picUrl == null) {
                continue;
            }
            downloadQuietly(picUrl, word, name);
        }
        System.out.println("下载成功: " + suc.get());
        System.out.println("下载失败: " + fails.get());
        long end = System.currentTimeMillis();
        System.out.println("耗时:" + (end - start) / 1000 + "秒");
    }

    /**
     * Downloads the images concurrently on a bounded thread pool and blocks until every
     * download has finished. Despite the name, the downloads themselves run in parallel; only
     * the call as a whole is synchronous.
     *
     * @param data image URLs; {@code null} entries are skipped but still occupy an index
     * @param word category name, used as the sub-directory
     */
    public void processSync(List<String> data, String word) {
        long start = System.currentTimeMillis();
        int threads = Math.max(1, Math.min(MAX_DOWNLOAD_THREADS, data.size()));
        ExecutorService executorService = Executors.newFixedThreadPool(threads);
        int index = 0;
        for (String picUrl : data) {
            final String name = indexName(index++);
            if (picUrl == null) {
                continue;
            }
            executorService.execute(() -> downloadQuietly(picUrl, word, name));
        }
        executorService.shutdown();
        try {
            // Keep waiting rather than reporting totals while downloads are still running.
            while (!executorService.awaitTermination(60, TimeUnit.SECONDS)) {
                System.out.println("AwaitTermination still waiting, finished so far: " + suc.get());
            }
        } catch (InterruptedException e) {
            // Cancel outstanding work and preserve the interrupt for the caller.
            executorService.shutdownNow();
            Thread.currentThread().interrupt();
            System.err.println("processSync interrupted before all downloads finished");
            return;
        }
        System.out.println("AwaitTermination Finished");
        System.out.println("共有URL: " + data.size());
        System.out.println("下载成功: " + suc);
        System.out.println("下载失败: " + fails);
        // list() returns null when the directory does not exist (e.g. nothing was downloadable).
        String[] files = new File(this.path + "/" + word + "/").list();
        int len = files == null ? 0 : files.length;
        System.out.println("当前共有文件: " + len);
        long end = System.currentTimeMillis();
        System.out.println("耗时:" + (end - start) / 1000.0 + "秒");
    }
}

HttpClientUtils.java (HTTP request utility class) – provides GET, POST, AJAX helpers and SSL‑ignoring client creation. The class is omitted for brevity but is essential for the crawler.

Execution: A single run may not download every image because of network issues. Re‑running the program skips the files that already exist locally and retries the rest, which improves the success rate.

Finally, the article encourages readers to share the tutorial and join a technical community for further learning.

Tags: Java, HTTP, Multithreading, web scraping, Image Download, Sogou
Java Architect Essentials
Written by

Java Architect Essentials

Committed to sharing quality articles and tutorials to help Java programmers progress from junior to mid-level to senior architect. We curate high-quality learning resources, interview questions, videos, and projects from across the internet to help you systematically improve your Java architecture skills. Follow and reply '1024' to get Java programming resources. Learn together, grow together.

0 followers
Reader feedback

How this landed with the community

login Sign in to like

Rate this article

Was this worth your time?

Sign in to rate
Discussion

0 Comments

Thoughtful readers leave field notes, pushback, and hard-won operational detail here.