How to Crawl and Download Thousands of Sogou Images Using Java
This guide explains how to scrape thousands of images from Sogou by analyzing the request URL, extracting image URLs from JSON responses, and implementing a multithreaded Java downloader with custom HTTP utilities and pipelines to store the pictures locally.
Purpose
Crawl Sogou image search for thousands of pictures of a given keyword (e.g., "美女") and download them to the local file system.
Preparation
Target URL:
https://pic.sogou.com/pics?query=美女
Analysis
Open the page, use the browser developer tools (Network → XHR) and scroll down to see the request URL:
Request URL:
https://pic.sogou.com/napi/pc/searchList?mode=1&start=48&xml_len=48&query=美女
Key parameters:
start=48 – start index of images
xml_len=48 – number of images to fetch per request
query=美女 – search keyword (URL‑encoded automatically by the browser)
The JSON response contains the image URLs in the picUrl field.
Approach
1. Build the request URL with the required parameters.
2. Send HTTP GET requests to obtain the JSON data.
3. Extract the picUrl values and store them in a list.
4. Use a thread pool to download the images concurrently and save them locally.
Code
import com.alibaba.fastjson.JSONObject;
import us.codecraft.webmagic.utils.HttpClientUtils;
import victor.chang.crawler.pipeline.SougouImgPipeline;
import java.util.ArrayList;
import java.util.List;
/**
* A simple PageProcessor.
*/
public class SougouImgProcessor {
/** Request URL template with three format slots: start index, page size, keyword. */
private final String url;
/** Downloader that stores the collected images on disk. */
private final SougouImgPipeline pipeline;
/** Raw JSON items accumulated over all processed pages. */
private final List<JSONObject> dataList;
/** Image URLs (the picUrl field of each item) accumulated over all processed pages. */
private final List<String> urlList;
/** Search keyword; also passed to the pipeline as the download category. */
private final String word;

/**
 * Creates a processor for one keyword.
 *
 * @param url  request URL template containing three format specifiers that are
 *             filled with the start index, the page size and the keyword
 * @param word search keyword
 */
public SougouImgProcessor(String url, String word) {
    this.url = url;
    this.word = word;
    this.pipeline = new SougouImgPipeline();
    this.dataList = new ArrayList<>();
    this.urlList = new ArrayList<>();
}
/**
 * Fetches one page of search results and collects the image URLs it contains.
 *
 * <p>A page that cannot be fetched or parsed is reported on {@code System.err}
 * and skipped, so a single bad response does not abort the whole crawl.
 *
 * @param idx  index of the first result to request
 * @param size number of results to request
 */
public void process(int idx, int size) {
    // Encode the keyword so that spaces, '&', '#', etc. cannot corrupt the query string.
    String encodedWord;
    try {
        encodedWord = java.net.URLEncoder.encode(String.valueOf(this.word), "UTF-8");
    } catch (java.io.UnsupportedEncodingException e) {
        throw new IllegalStateException("UTF-8 is not supported by this JVM", e);
    }

    String res = HttpClientUtils.get(String.format(this.url, idx, size, encodedWord));
    // HttpClientUtils.get returns an empty string when the request fails.
    if (res == null || res.trim().isEmpty()) {
        System.err.println("Empty response for start=" + idx + ", skipping batch");
        return;
    }

    List<Object> items;
    try {
        JSONObject object = JSONObject.parseObject(res);
        JSONObject data = object == null ? null : object.getJSONObject("data");
        items = data == null ? null : data.getJSONArray("items");
    } catch (com.alibaba.fastjson.JSONException | ClassCastException e) {
        System.err.println("Unparseable response for start=" + idx + ": " + e);
        return;
    }
    if (items == null) {
        return;
    }

    for (Object entry : items) {
        if (!(entry instanceof JSONObject)) {
            continue;
        }
        JSONObject item = (JSONObject) entry;
        this.dataList.add(item);
        String picUrl = item.getString("picUrl");
        if (picUrl != null && !picUrl.isEmpty()) {
            this.urlList.add(picUrl);
        }
    }
}
/**
 * Hands every image URL collected so far, together with the keyword,
 * to the pipeline's multi-threaded downloader.
 */
public void pipelineData() {
    final List<String> collected = this.urlList;
    this.pipeline.processSync(collected, this.word);
}
/**
 * Entry point: requests result pages for a fixed keyword in batches,
 * then downloads every collected image.
 *
 * @param args unused
 */
public static void main(String[] args) {
    final String template = "https://pic.sogou.com/napi/pc/searchList?mode=1&start=%s&xml_len=%s&query=%s";
    final int firstIndex = 0;   // start index
    final int batchSize = 50;   // results requested per call
    final int total = 1000;     // total number of results to request

    SougouImgProcessor crawler = new SougouImgProcessor(template, "美女");
    int offset = firstIndex;
    while (offset < firstIndex + total) {
        crawler.process(offset, batchSize);
        offset += batchSize;
    }
    crawler.pipelineData();
}
} import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.List;
import java.util.Objects;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
/**
* Store results in files.
*/
public class SougouImgPipeline {
/** Default file extension. */
private String extension = ".jpg";
/** Root directory under which one sub-directory per category is created. */
private String path;
/**
 * Download counters. The references never change, so they are final;
 * AtomicInteger already provides the cross-thread visibility that the
 * former volatile modifier did not add to.
 */
private final AtomicInteger suc = new AtomicInteger();
private final AtomicInteger fails = new AtomicInteger();

/** Creates a pipeline that stores files under {@code E:/pipeline/sougou}. */
public SougouImgPipeline() {
    this("E:/pipeline/sougou");
}

/**
 * Creates a pipeline that stores files under the given root directory.
 *
 * @param path root directory for downloaded files
 */
public SougouImgPipeline(String path) {
    setPath(path);
}

/**
 * Changes the root directory for downloaded files.
 *
 * @param path new root directory
 */
public void setPath(String path) {
    this.path = path;
}
/**
 * Downloads a single image into {@code <path>/<cate>/}.
 *
 * <p>The file name is {@code name} plus the extension taken from the URL,
 * with every {@code -} removed. An existing file is left untouched. When the
 * transfer fails, the partially written file is deleted so that a later run
 * does not mistake it for a finished download.
 *
 * @param url  image URL
 * @param cate category; used as the sub-directory name
 * @param name file name without extension
 * @throws Exception if the directory cannot be created or the transfer fails
 */
private void downloadImg(String url, String cate, String name) throws Exception {
    String dirPath = this.path + "/" + cate + "/";
    File dir = new File(dirPath);
    // mkdirs() returns false when a concurrent worker created the directory first,
    // so re-check before treating it as an error.
    if (!dir.isDirectory() && !dir.mkdirs() && !dir.isDirectory()) {
        throw new java.io.IOException("Cannot create directory " + dirPath);
    }

    String fileName = (name + resolveExtension(url)).replace("-", "");
    File img = new File(dirPath + fileName);
    if (img.exists()) {
        System.out.println(String.format("File %s already exists", fileName));
        return;
    }

    URLConnection con = new URL(url).openConnection();
    con.setConnectTimeout(5000);
    con.setReadTimeout(5000);

    boolean complete = false;
    try (InputStream inputStream = con.getInputStream();
         FileOutputStream os = new FileOutputStream(img)) {
        byte[] bs = new byte[8192];
        int len;
        while ((len = inputStream.read(bs)) != -1) {
            os.write(bs, 0, len);
        }
        complete = true;
    } finally {
        // Both streams are closed by now; remove a truncated file.
        if (!complete && img.exists()) {
            img.delete();
        }
    }

    System.out.println("picUrl: " + url);
    System.out.println(String.format("Downloading image %s", suc.getAndIncrement()));
}

/**
 * Extracts the file extension (including the leading dot) from a URL,
 * ignoring any query string or fragment. Falls back to the default
 * extension when the last path segment has no short alphanumeric suffix.
 */
private String resolveExtension(String url) {
    int end = url.length();
    int query = url.indexOf('?');
    if (query >= 0) {
        end = query;
    }
    int fragment = url.indexOf('#');
    if (fragment >= 0 && fragment < end) {
        end = fragment;
    }
    String pathPart = url.substring(0, end);

    int scheme = pathPart.indexOf("://");
    int pathStart = pathPart.indexOf('/', scheme >= 0 ? scheme + 3 : 0);
    int dot = pathPart.lastIndexOf('.');
    int slash = pathPart.lastIndexOf('/');
    // The dot must sit in the last path segment, not in the host name.
    if (pathStart >= 0 && dot > slash && dot < pathPart.length() - 1) {
        String candidate = pathPart.substring(dot);
        if (candidate.length() <= 6 && candidate.matches("\\.[A-Za-z0-9]+")) {
            return candidate;
        }
    }
    return this.extension;
}
/**
 * Downloads the given images one after another on the calling thread and
 * prints a summary.
 *
 * <p>Files are named after the zero-padded position of the URL in the list,
 * the same scheme {@link #processSync(List, String)} uses. (Using the URL
 * itself as the file name cannot work because it contains {@code :} and
 * {@code /}.)
 *
 * @param data image URLs; {@code null} entries are skipped
 * @param word category under which the files are stored
 */
public void process(List<String> data, String word) {
    long start = System.currentTimeMillis();
    if (data != null) {
        for (int i = 0; i < data.size(); i++) {
            String picUrl = data.get(i);
            if (picUrl == null) {
                continue;
            }
            try {
                downloadImg(picUrl, word, String.format("%03d", i));
            } catch (Exception e) {
                fails.incrementAndGet();
                System.err.println("Failed to download " + picUrl + ": " + e);
            }
        }
    }
    System.out.println("Success: " + suc.get());
    System.out.println("Failed: " + fails.get());
    System.out.println("Time: " + (System.currentTimeMillis() - start) / 1000 + "s");
}
/**
 * Downloads the given images concurrently and prints a summary once every
 * download has finished. Despite its name, this is the multi-threaded variant
 * of {@link #process(List, String)}.
 *
 * <p>A bounded pool is used so that a list of thousands of URLs does not
 * spawn thousands of threads. The method blocks until all downloads are done
 * (each one is bounded by its own connect/read timeouts), so the printed
 * statistics are final.
 *
 * @param data image URLs; {@code null} entries are skipped
 * @param word category under which the files are stored
 */
public void processSync(List<String> data, String word) {
    long start = System.currentTimeMillis();
    final int maxThreads = 16;
    final int total = data == null ? 0 : data.size();

    ExecutorService executor = Executors.newFixedThreadPool(Math.max(1, Math.min(maxThreads, total)));
    for (int i = 0; i < total; i++) {
        final String picUrl = data.get(i);
        if (picUrl == null) {
            continue;
        }
        final String name = String.format("%03d", i);
        executor.execute(() -> {
            try {
                downloadImg(picUrl, word, name);
            } catch (Exception e) {
                fails.incrementAndGet();
                System.err.println("Failed to download " + picUrl + ": " + e);
            }
        });
    }

    executor.shutdown();
    try {
        while (!executor.awaitTermination(60, TimeUnit.SECONDS)) {
            System.out.println("Still downloading...");
        }
    } catch (InterruptedException e) {
        // Stop outstanding work and preserve the interrupt for the caller.
        executor.shutdownNow();
        Thread.currentThread().interrupt();
    }

    System.out.println("Total URLs: " + total);
    System.out.println("Success: " + suc);
    System.out.println("Failed: " + fails);
    System.out.println("Time: " + (System.currentTimeMillis() - start) / 1000.0 + "s");
}
} import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.NameValuePair;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.conn.ssl.TrustStrategy;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.ssl.SSLContextBuilder;
import org.apache.http.util.EntityUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.net.ssl.HostnameVerifier;
import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLSession;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.security.GeneralSecurityException;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* HTTP request utility class.
*/
public abstract class HttpClientUtils {
/**
 * Groups header values by header name.
 *
 * @param headers headers to convert
 * @return map from each header name to the list of its values, in the order
 *         the headers appear in the array
 */
public static Map<String, List<String>> convertHeaders(Header[] headers) {
    Map<String, List<String>> grouped = new HashMap<>();
    for (int i = 0; i < headers.length; i++) {
        Header current = headers[i];
        grouped.computeIfAbsent(current.getName(), key -> new ArrayList<>())
                .add(current.getValue());
    }
    return grouped;
}
/**
 * Sends a GET request and decodes the response body as UTF-8.
 *
 * @param url request URL
 * @return response body as returned by {@code executeRequest}
 */
public static String get(String url) {
    final String defaultCharset = "UTF-8";
    return get(url, defaultCharset);
}

/**
 * Sends a GET request and decodes the response body with the given charset.
 *
 * @param url     request URL
 * @param charset charset name passed on to {@code executeRequest}
 * @return response body as returned by {@code executeRequest}
 */
public static String get(String url, String charset) {
    return executeRequest(new HttpGet(url), charset);
}
/**
 * Sends a GET request marked as an XMLHttpRequest and decodes the response
 * body as UTF-8.
 *
 * @param url request URL
 * @return response body as returned by {@code executeRequest}
 */
public static String ajaxGet(String url) {
    final String defaultCharset = "UTF-8";
    return ajaxGet(url, defaultCharset);
}

/**
 * Sends a GET request carrying the {@code X-Requested-With: XMLHttpRequest}
 * header and decodes the response body with the given charset.
 *
 * @param url     request URL
 * @param charset charset name passed on to {@code executeRequest}
 * @return response body as returned by {@code executeRequest}
 */
public static String ajaxGet(String url, String charset) {
    HttpGet request = new HttpGet(url);
    request.setHeader("X-Requested-With", "XMLHttpRequest");
    String body = executeRequest(request, charset);
    return body;
}
/**
 * Sends a form-encoded POST request using UTF-8.
 *
 * @param url     request URL
 * @param dataMap form fields; {@code null} sends a request without a body
 * @return response body as returned by {@code executeRequest}
 */
public static String post(String url, Map<String, String> dataMap) {
    return post(url, dataMap, "UTF-8");
}

/**
 * Sends a form-encoded POST request.
 *
 * @param url     request URL
 * @param dataMap form fields; {@code null} sends a request without a body
 * @param charset charset used to encode the form and decode the response
 * @return response body as returned by {@code executeRequest}
 * @throws IllegalArgumentException if {@code charset} is not supported
 */
public static String post(String url, Map<String, String> dataMap, String charset) {
    HttpPost httpPost = new HttpPost(url);
    if (dataMap != null) {
        httpPost.setEntity(buildFormEntity(dataMap, charset));
    }
    return executeRequest(httpPost, charset);
}

/**
 * Sends a form-encoded POST request marked as an XMLHttpRequest, using UTF-8.
 *
 * @param url     request URL
 * @param dataMap form fields; {@code null} sends a request without a body
 * @return response body as returned by {@code executeRequest}
 */
public static String ajaxPost(String url, Map<String, String> dataMap) {
    return ajaxPost(url, dataMap, "UTF-8");
}

/**
 * Sends a form-encoded POST request carrying the
 * {@code X-Requested-With: XMLHttpRequest} header.
 *
 * @param url     request URL
 * @param dataMap form fields; {@code null} sends a request without a body
 * @param charset charset used to encode the form and decode the response
 * @return response body as returned by {@code executeRequest}
 * @throws IllegalArgumentException if {@code charset} is not supported
 */
public static String ajaxPost(String url, Map<String, String> dataMap, String charset) {
    HttpPost httpPost = new HttpPost(url);
    httpPost.setHeader("X-Requested-With", "XMLHttpRequest");
    if (dataMap != null) {
        httpPost.setEntity(buildFormEntity(dataMap, charset));
    }
    return executeRequest(httpPost, charset);
}

/**
 * Builds the URL-encoded form body shared by the POST variants.
 *
 * <p>No Content-Encoding header is set: that header names a compression
 * coding such as gzip, not a character set. The charset is already handed to
 * the UrlEncodedFormEntity constructor.
 */
private static UrlEncodedFormEntity buildFormEntity(Map<String, String> dataMap, String charset) {
    List<NameValuePair> nvps = new ArrayList<>(dataMap.size());
    for (Map.Entry<String, String> entry : dataMap.entrySet()) {
        nvps.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
    }
    try {
        return new UrlEncodedFormEntity(nvps, charset);
    } catch (UnsupportedEncodingException e) {
        // Fail before sending, instead of silently posting an empty body.
        throw new IllegalArgumentException("Unsupported charset: " + charset, e);
    }
}
/**
 * Posts a JSON document as an XMLHttpRequest, using UTF-8.
 *
 * @param url        request URL
 * @param jsonString JSON text to send as the request body
 * @return response body as returned by {@code executeRequest}
 */
public static String ajaxPostJson(String url, String jsonString) {
    return ajaxPostJson(url, jsonString, "UTF-8");
}

/**
 * Posts a JSON document with the {@code X-Requested-With: XMLHttpRequest}
 * header and content type {@code application/json}.
 *
 * @param url        request URL
 * @param jsonString JSON text to send as the request body
 * @param charset    charset used to encode the body and decode the response
 * @return response body as returned by {@code executeRequest}
 */
public static String ajaxPostJson(String url, String jsonString, String charset) {
    HttpPost httpPost = new HttpPost(url);
    httpPost.setHeader("X-Requested-With", "XMLHttpRequest");
    // The charset is applied when the entity encodes the string. It must not be
    // sent as Content-Encoding, which names a compression coding such as gzip.
    StringEntity stringEntity = new StringEntity(jsonString, charset);
    stringEntity.setContentType("application/json");
    httpPost.setEntity(stringEntity);
    return executeRequest(httpPost, charset);
}
/**
 * Executes a request and decodes the response body as UTF-8.
 *
 * @param httpRequest request to execute
 * @return response body, or an empty string on I/O failure or empty response
 */
public static String executeRequest(HttpUriRequest httpRequest) {
    return executeRequest(httpRequest, "UTF-8");
}

/**
 * Executes a request with a client created for this call and returns the
 * response body as text.
 *
 * <p>NOTE(review): requests whose scheme is {@code https} use the client from
 * {@link #createSSLInsecureClient()}, which accepts every certificate and
 * host name. Do not use this for sensitive traffic.
 *
 * @param httpRequest request to execute
 * @param charset     charset used to decode the body when the response does
 *                    not declare one
 * @return response body; an empty string if an {@link IOException} occurs or
 *         the response carries no entity (for example 204 No Content)
 */
public static String executeRequest(HttpUriRequest httpRequest, String charset) {
    boolean https = "https".equals(httpRequest.getURI().getScheme());
    String result = "";
    try (CloseableHttpClient httpclient = https ? createSSLInsecureClient() : HttpClients.createDefault();
         CloseableHttpResponse response = httpclient.execute(httpRequest)) {
        HttpEntity entity = response.getEntity();
        // EntityUtils.toString rejects a null entity, so guard body-less responses.
        if (entity != null) {
            result = EntityUtils.toString(entity, charset);
            EntityUtils.consume(entity);
        }
    } catch (IOException ex) {
        ex.printStackTrace();
    }
    return result;
}
/**
 * Builds an HTTP client whose TLS layer trusts every certificate chain and
 * accepts every host name.
 *
 * <p>WARNING: this disables certificate and host-name verification, so
 * connections made with this client can be intercepted. It is suitable only
 * for crawling public, non-sensitive content.
 *
 * @return a new client; the caller is responsible for closing it
 * @throws RuntimeException wrapping the {@link GeneralSecurityException} if
 *         the SSL context cannot be built
 */
public static CloseableHttpClient createSSLInsecureClient() {
    final TrustStrategy trustEverything = (X509Certificate[] chain, String authType) -> true;
    final HostnameVerifier acceptAnyHost = (hostname, session) -> true;
    final SSLContext context;
    try {
        context = new SSLContextBuilder().loadTrustMaterial(trustEverything).build();
    } catch (GeneralSecurityException ex) {
        throw new RuntimeException(ex);
    }
    SSLConnectionSocketFactory socketFactory = new SSLConnectionSocketFactory(context, acceptAnyHost);
    return HttpClients.custom().setSSLSocketFactory(socketFactory).build();
}
}
Run
Network issues may cause some downloads to fail; re‑run the program or increase the retry count to improve the success rate.
Signed-in readers can open the original source through BestHub's protected redirect.
This article has been distilled and summarized from source material, then republished for learning and reference. If you believe it infringes your rights, please contact us and we will review it promptly.
Java Backend Technology
Focus on Java-related technologies: SSM, Spring ecosystem, microservices, MySQL, MyCat, clustering, distributed systems, middleware, Linux, networking, multithreading. Occasionally cover DevOps tools like Jenkins, Nexus, Docker, and ELK. Also share technical insights from time to time, committed to Java full-stack development!
How this landed with the community
Was this worth your time?
0 Comments
Thoughtful readers leave field notes, pushback, and hard-won operational detail here.
