Java Sogou Image Crawler: Fetching and Downloading Images with WebMagic
This tutorial explains how to use Java and the WebMagic framework to crawl thousands of images from Sogou image search, parse the JSON responses to extract image URLs, and download the pictures locally using multithreaded processing and custom HTTP utilities.
Purpose: crawl thousands of images of "美女" from Sogou image search and download them locally.
Preparation: the request URL is https://pic.sogou.com/pics?query=美女 ; the API endpoint for pagination is https://pic.sogou.com/napi/pc/searchList?mode=1&start=48&xml_len=48&query=美女 . Parameters start , xml_len , and query control pagination and keyword.
Analysis: Using browser DevTools (Network → XHR) to capture the JSON response, the image URLs are located in the picUrl field.
Idea: The crawler sets request parameters, fetches JSON, extracts picUrl , stores URLs in a list, and downloads images using a thread pool.
Code: The implementation consists of three classes.
SougouImgProcessor.java – the crawler entry point: it builds the request URL, fetches JSON via HttpClientUtils, parses it with FastJSON, collects image URLs, and delegates downloading to SougouImgPipeline . (Note: despite the original description, it does not implement WebMagic's PageProcessor interface — it only reuses WebMagic's HttpClientUtils helper.)
import com.alibaba.fastjson.JSONObject;
import us.codecraft.webmagic.utils.HttpClientUtils;
import victor.chang.crawler.pipeline.SougouImgPipeline;
import java.util.ArrayList;
import java.util.List;
/**
 * Crawls the Sogou image-search JSON API, accumulates picture URLs, and hands
 * them to {@link SougouImgPipeline} for download.
 *
 * NOTE(review): this class does not implement WebMagic's PageProcessor
 * interface — it only reuses WebMagic's HttpClientUtils for the HTTP GET.
 */
public class SougouImgProcessor {
    /** URL template with three %s placeholders: start index, page size, keyword. */
    private final String url;
    /** Pipeline that writes the images to disk. */
    private final SougouImgPipeline pipeline;
    /** Raw JSON items, kept in case further fields are needed later. */
    private final List<JSONObject> dataList;
    /** Collected image URLs (the picUrl field of each item). */
    private final List<String> urlList;
    /** Search keyword; also used as the download sub-directory name. */
    private final String word;

    public SougouImgProcessor(String url, String word) {
        this.url = url;
        this.word = word;
        this.pipeline = new SougouImgPipeline();
        this.dataList = new ArrayList<>();
        this.urlList = new ArrayList<>();
    }

    /**
     * Fetches one page of search results and accumulates its image URLs.
     * A failed or malformed batch is skipped so the overall crawl continues.
     *
     * @param idx  start index of the page (API parameter {@code start})
     * @param size page size (API parameter {@code xml_len})
     */
    public void process(int idx, int size) {
        String res = HttpClientUtils.get(String.format(this.url, idx, size, this.word));
        if (res == null || res.isEmpty()) {
            // Network hiccup or empty response: skip this batch instead of crashing the run.
            return;
        }
        try {
            JSONObject object = JSONObject.parseObject(res);
            JSONObject data = (JSONObject) object.get("data");
            if (data == null) {
                return;
            }
            // FastJSON returns a JSONArray here, which implements List.
            @SuppressWarnings("unchecked")
            List<JSONObject> items = (List<JSONObject>) data.get("items");
            if (items == null) {
                return;
            }
            for (JSONObject item : items) {
                String picUrl = item.getString("picUrl");
                if (picUrl != null) { // skip items without a picture URL
                    this.urlList.add(picUrl);
                }
            }
            this.dataList.addAll(items);
        } catch (RuntimeException e) {
            // Malformed JSON affects only this batch; keep crawling the remaining pages.
            System.err.println("Failed to parse batch starting at " + idx + ": " + e.getMessage());
        }
    }

    /** Downloads every collected URL using the pipeline's thread pool. */
    public void pipelineData() {
        pipeline.processSync(this.urlList, this.word);
    }

    public static void main(String[] args) {
        String url = "https://pic.sogou.com/napi/pc/searchList?mode=1&start=%s&xml_len=%s&query=%s";
        SougouImgProcessor processor = new SougouImgProcessor(url, "美女");
        int start = 0, size = 50, limit = 1000; // start index, batch size, total
        for (int i = start; i < start + limit; i += size) {
            processor.process(i, size);
        }
        processor.pipelineData();
    }
}

SougouImgPipeline.java – handles file creation, directory management, and actual image download. It provides single‑thread, multi‑thread, and segmented multi‑thread methods, using ExecutorService and atomic counters to track success and failures.
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.List;
import java.util.Objects;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
/**
 * Stores downloaded images on disk under {@code path/<keyword>/}.
 * Thread-safe for concurrent calls to {@link #processSync}: the only shared
 * mutable state is the pair of atomic counters.
 */
public class SougouImgPipeline {
    /** Fallback file extension used when the URL does not reveal one. */
    private String extension = ".jpg";
    /** Root directory for all downloads. */
    private String path;
    // AtomicInteger is already thread-safe; the references never change after
    // construction, so the fields are final rather than volatile.
    private final AtomicInteger suc;
    private final AtomicInteger fails;

    public SougouImgPipeline() {
        setPath("E:/pipeline/sougou");
        suc = new AtomicInteger();
        fails = new AtomicInteger();
    }

    public void setPath(String path) { this.path = path; }

    /**
     * Downloads a single image to {@code path/cate/<name><ext>}.
     * Skips the download when the target file already exists.
     *
     * @param url  image URL to fetch
     * @param cate sub-directory (the search keyword)
     * @param name base file name, without extension
     * @throws Exception on any network or file-system failure
     */
    private void downloadImg(String url, String cate, String name) throws Exception {
        String dirPath = this.path + "/" + cate + "/";
        File dir = new File(dirPath);
        if (!dir.exists() && !dir.mkdirs()) {
            throw new IllegalStateException("Cannot create directory " + dirPath);
        }
        // Derive the extension from the last path segment of the URL; the original
        // code threw StringIndexOutOfBoundsException on extension-less URLs and
        // kept any query string. Fall back to the default extension when absent.
        int dot = url.lastIndexOf('.');
        int slash = url.lastIndexOf('/');
        String realExt = (dot > slash) ? url.substring(dot) : this.extension;
        int query = realExt.indexOf('?');
        if (query >= 0) {
            realExt = realExt.substring(0, query);
        }
        String fileName = (name + realExt).replace("-", "");
        File img = new File(dirPath + fileName);
        if (img.exists()) {
            System.out.println(String.format("文件%s已存在本地目录", fileName));
            return;
        }
        URLConnection con = new URL(url).openConnection();
        con.setConnectTimeout(5000);
        con.setReadTimeout(5000);
        // try-with-resources: the original leaked both streams and opened the
        // output in append mode, which corrupted files on re-download attempts.
        try (InputStream in = con.getInputStream();
             FileOutputStream os = new FileOutputStream(img)) {
            byte[] buf = new byte[1024];
            int len;
            while ((len = in.read(buf)) != -1) {
                os.write(buf, 0, len);
            }
        }
        System.out.println("picUrl: " + url);
        System.out.println(String.format("正在下载第%s张图片", suc.getAndIncrement()));
    }

    /**
     * Downloads the URLs sequentially on the calling thread.
     * NOTE(review): the picture URL itself is passed as the file name here,
     * which yields names containing '/' and ':' — invalid on Windows; prefer
     * {@link #processSync} which uses zero-padded sequence numbers.
     */
    public void process(List<String> data, String word) {
        long start = System.currentTimeMillis();
        for (String picUrl : data) {
            if (picUrl == null) continue;
            try {
                downloadImg(picUrl, word, picUrl);
            } catch (Exception e) {
                fails.incrementAndGet();
            }
        }
        System.out.println("下载成功: " + suc.get());
        System.out.println("下载失败: " + fails.get());
        long end = System.currentTimeMillis();
        System.out.println("耗时:" + (end - start) / 1000 + "秒");
    }

    /**
     * Downloads all URLs concurrently on a cached thread pool, waits up to
     * 60 seconds for completion, then prints a summary.
     */
    public void processSync(List<String> data, String word) {
        long start = System.currentTimeMillis();
        ExecutorService executorService = Executors.newCachedThreadPool();
        for (int i = 0; i < data.size(); i++) {
            String picUrl = data.get(i);
            if (picUrl == null) continue;
            // Zero-pad the sequence number so files sort naturally. The original
            // if/else chain produced an EMPTY name for i >= 1000, so all later
            // images collided on the same file name.
            final String finalName = String.format("%04d", i);
            executorService.execute(() -> {
                try {
                    downloadImg(picUrl, word, finalName);
                } catch (Exception e) {
                    fails.incrementAndGet();
                }
            });
        }
        executorService.shutdown();
        try {
            if (!executorService.awaitTermination(60, TimeUnit.SECONDS)) {
                // Still running after 60 s: cancel the remaining downloads.
                executorService.shutdownNow();
            }
            System.out.println("AwaitTermination Finished");
            System.out.println("共有URL: " + data.size());
            System.out.println("下载成功: " + suc);
            System.out.println("下载失败: " + fails);
            File dir = new File(this.path + "/" + word + "/");
            // dir.list() returns null when the directory was never created
            // (e.g. every download failed); the original NPE'd here.
            String[] files = dir.list();
            System.out.println("当前共有文件: " + (files == null ? 0 : files.length));
            long end = System.currentTimeMillis();
            System.out.println("耗时:" + (end - start) / 1000.0 + "秒");
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt(); // restore the interrupt flag
        }
    }

    // additional segmented method omitted for brevity
}

HttpClientUtils.java – a lightweight HTTP utility based on Apache HttpClient, offering GET, POST, AJAX‑style requests, header conversion, and SSL‑insecure client creation.
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.NameValuePair;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.conn.ssl.TrustStrategy;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.ssl.SSLContextBuilder;
import org.apache.http.util.EntityUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.net.ssl.HostnameVerifier;
import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLSession;
import java.io.IOException;
import java.security.GeneralSecurityException;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* HTTP utility class.
*/
public abstract class HttpClientUtils {
public static Map
> convertHeaders(Header[] headers) { /* ... */ }
public static String get(String url) { return get(url, "UTF-8"); }
public static String get(String url, String charset) { HttpGet httpGet = new HttpGet(url); return executeRequest(httpGet, charset); }
public static String ajaxGet(String url) { return ajaxGet(url, "UTF-8"); }
public static String ajaxGet(String url, String charset) { HttpGet httpGet = new HttpGet(url); httpGet.setHeader("X-Requested-With", "XMLHttpRequest"); return executeRequest(httpGet, charset); }
public static String post(String url, Map
dataMap) { return post(url, dataMap, "UTF-8"); }
public static String post(String url, Map
dataMap, String charset) { HttpPost httpPost = new HttpPost(url); /* set entity */ return executeRequest(httpPost, charset); }
public static String ajaxPost(String url, Map
dataMap) { return ajaxPost(url, dataMap, "UTF-8"); }
public static String ajaxPost(String url, Map
dataMap, String charset) { HttpPost httpPost = new HttpPost(url); httpPost.setHeader("X-Requested-With", "XMLHttpRequest"); /* set entity */ return executeRequest(httpPost, charset); }
public static String ajaxPostJson(String url, String jsonString) { return ajaxPostJson(url, jsonString, "UTF-8"); }
public static String ajaxPostJson(String url, String jsonString, String charset) { HttpPost httpPost = new HttpPost(url); httpPost.setHeader("X-Requested-With", "XMLHttpRequest"); StringEntity stringEntity = new StringEntity(jsonString, charset); stringEntity.setContentEncoding(charset); stringEntity.setContentType("application/json"); httpPost.setEntity(stringEntity); return executeRequest(httpPost, charset); }
public static String executeRequest(HttpUriRequest httpRequest) { return executeRequest(httpRequest, "UTF-8"); }
public static String executeRequest(HttpUriRequest httpRequest, String charset) { CloseableHttpClient httpclient; if ("https".equals(httpRequest.getURI().getScheme())) { httpclient = createSSLInsecureClient(); } else { httpclient = HttpClients.createDefault(); } String result = ""; try { CloseableHttpResponse response = httpclient.execute(httpRequest); try { HttpEntity entity = response.getEntity(); result = EntityUtils.toString(entity, charset); } finally { EntityUtils.consume(response.getEntity()); response.close(); } } catch (IOException ex) { ex.printStackTrace(); } finally { try { httpclient.close(); } catch (IOException e) { e.printStackTrace(); } } return result; }
public static CloseableHttpClient createSSLInsecureClient() { try { SSLContext sslContext = new SSLContextBuilder().loadTrustMaterial((chain, authType) -> true).build(); SSLConnectionSocketFactory sslsf = new SSLConnectionSocketFactory(sslContext, (hostname, session) -> true); return HttpClients.custom().setSSLSocketFactory(sslsf).build(); } catch (GeneralSecurityException ex) { throw new RuntimeException(ex); } }
}Run: The main method iterates over pages (start, size, limit), calls process for each batch, and finally invokes pipelineData to download all collected images. Because of network instability, multiple runs may be needed to achieve a high success rate.
Conclusion: The article demonstrates a complete end‑to‑end Java solution for crawling and downloading images from Sogou, illustrating HTTP request handling, JSON parsing, multithreaded I/O, and practical debugging with browser tools.
Java Architect Essentials
Committed to sharing quality articles and tutorials to help Java programmers progress from junior to mid-level to senior architect. We curate high-quality learning resources, interview questions, videos, and projects from across the internet to help you systematically improve your Java architecture skills. Follow and reply '1024' to get Java programming resources. Learn together, grow together.
How this landed with the community
Was this worth your time?
0 Comments
Thoughtful readers leave field notes, pushback, and hard-won operational detail here.