Java HttpClient Web Crawler for Chinese Dictionary with Regex Extraction and MySQL Storage
This article presents a Java HttpClient-based web crawler that extracts Chinese characters and their pinyin using regular expressions, stores the data in a MySQL database, and provides complete source code along with execution results and screenshots.
The author shares a practical Java implementation that uses HttpClient to crawl Chinese dictionary pages, extracts characters and pinyin via regular expressions, and stores the results into a MySQL database.
/**
 * Entry point: sets {@code DEFAULT_CHARSET} to {@code GB2312}, reads the lines of
 * {@code LONG_Path + "word.log"}, runs {@link #getPYAndWord(String)} once per line in file order,
 * and finally calls {@code testOver()}.
 *
 * @param args command-line arguments (not used)
 * @throws SQLException declared for callers; nothing in this body catches it
 */
public static void main(String[] args) throws SQLException {
    DEFAULT_CHARSET = GB2312;
    // Each line of word.log is passed on as one pinyin syllable.
    for (String pinyin : WriteRead.readTxtFileByLine(LONG_Path + "word.log")) {
        getPYAndWord(pinyin);
    }
    testOver();
}
/**
 * Crawls the character index page of one pinyin syllable and stores what it finds.
 *
 * <p>Requests {@code http://zd.diyifanwen.com/zidian/py/<py>.htm}, extracts every link whose anchor
 * text is a single Chinese character, and inserts one row per character (character and link URL)
 * into {@code chinese_dictionary_word}. Afterwards one row mapping the pinyin to the
 * comma-separated list of its characters is inserted into {@code chinese_dictionary_py_word},
 * and {@code sleep(2)} is called (presumably to throttle requests; confirm the unit in the helper).
 *
 * @param py pinyin syllable used as the page name in the request URL
 */
public static void getPYAndWord(String py) {
    output(py);
    String url = "http://zd.diyifanwen.com/zidian/py/" + py + ".htm";
    HttpGet httpGet = getHttpGet(url);
    JSONObject response = getHttpResponse(httpGet);
    String content = response.getString("content");
    String all = new String(content.getBytes(UTF_8), UTF_8);

    // Characters found on this page, already stripped of the surrounding '>' and '<'.
    List<String> words = new ArrayList<>();
    List<String> anchors = regexAll(all, "http://zd.d.*?>[\\u4e00-\\u9FFF]<");
    output(anchors.size());

    for (String line : anchors) {
        List<String> urls = regexAll(line, "http://zd.diyifanwen.com/zidian/\\w/\\d+.htm");
        // Same character class as the anchor pattern above, so every anchor that was selected
        // also yields its character (the previous pattern required a stray '&' and a narrower
        // range, which left the result list empty).
        List<String> chars = regexAll(line, ">[\\u4e00-\\u9FFF]<");
        if (urls.isEmpty() || chars.isEmpty()) {
            // Not a link to a character page; skip instead of failing on get(0).
            continue;
        }
        String murl = urls.get(0);
        String mword = chars.get(0);
        output(murl, mword);

        String word = mword.replaceAll("<|>", EMPTY);
        words.add(word);

        // NOTE(review): the statement is assembled with String.format because sendWork is called
        // with a finished SQL string; binding parameters would require a change in MySqlTest.
        String sql = "INSERT INTO chinese_dictionary_word (word,url) VALUES (\"%s\",\"%s\");";
        sql = String.format(sql, word, murl);
        output(sql);
        MySqlTest.sendWork(sql);
    }

    // Same "a, b, c" layout that List.toString() produces once its brackets are removed.
    String str = String.join(", ", words);
    String sql = "INSERT INTO chinese_dictionary_py_word (py,words) VALUES (\"%s\",\"%s\");";
    sql = String.format(sql, py, str);
    output(sql);
    MySqlTest.sendWork(sql);
    sleep(2);
}
/**
 * Fetches the pinyin list page.
 *
 * <p>Requests {@code http://zd.diyifanwen.com/zidian/py/}, takes the {@code "content"} field of the
 * response, passes it through a UTF_8 encode/decode round trip, logs it under the key
 * {@code "content"} and returns it.
 *
 * @return the page content as a string
 */
public static String getPY() {
    JSONObject response = getHttpResponse(getHttpGet("http://zd.diyifanwen.com/zidian/py/"));
    String all = new String(response.getString("content").getBytes(UTF_8), UTF_8);
    Log.log("content", all);
    return all;
}
/**
 * Extracts every initial letter together with its pinyin syllables and stores each pair.
 *
 * <p>For each section introduced by a {@code pyTitle} heading, the single character following the
 * heading label is taken as the initial; every {@code /py/<name>.htm} link inside that section
 * contributes one {@code (first_word, all_word)} row to {@code chinese_dictionary_py}.
 *
 * @param all text to scan; presumably the page content returned by {@code getPY()}
 */
public static void getAllPY(String all) {
    String insertTemplate = "INSERT INTO chinese_dictionary_py (first_word,all_word) VALUES (\"%s\",\"%s\");";
    for (String section : regexAll(all, "<dt class=\"pyTitle\">拼音首字母\\w+</dt>" + LINE + ".+/dd>")) {
        // The heading label is five characters long; the initial letter is the character after it.
        int labelStart = section.indexOf("拼音首字母");
        String initial = section.substring(labelStart + 5, labelStart + 6);
        for (String link : regexAll(section, "http://zd.diyifanwen.com/zidian/py/\\w+.htm")) {
            // The pinyin is the file name between "/py/" and the last dot.
            String pinyin = link.substring(link.indexOf("/py/") + 4, link.lastIndexOf("."));
            MySqlTest.sendWork(String.format(insertTemplate, initial, pinyin));
        }
    }
}
/**
 * Checks whether every pinyin linked from the given text is already stored.
 *
 * <p>Each {@code zidian/py/<name>.htm} link is reduced to its name, which is printed and then looked
 * up in {@code chinese_dictionary_py.all_word}. When the lookup returns no row, the query itself
 * is printed so the missing pinyin can be identified.
 *
 * @param all text to scan; presumably the page content returned by {@code getPY()}
 */
public static void checkPY(String all) {
    List<String> links = regexAll(all, "zidian/py/\\w+.htm");
    for (String str : links) {
        int one = str.indexOf("/py/");
        int two = str.lastIndexOf(".");
        String second = str.substring(one + 4, two);
        output(second);
        String sql = "SELECT * FROM chinese_dictionary_py WHERE all_word = \"%s\";";
        String sq = String.format(sql, second);
        // Close each result set as soon as it has been inspected; one query runs per link.
        try (ResultSet resultSet = MySqlTest.excuteQuerySql(sq)) {
            if (!resultSet.next()) {
                output(sq);
            }
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }
}
/**
 * Reads every {@code all_word} value from {@code chinese_dictionary_py} and hands the resulting
 * list to {@code Save.saveStringList} under the name {@code "word"} (presumably written to a
 * file; confirm in {@code Save}).
 *
 * @throws SQLException if reading or closing the result set fails
 */
public static void getAllPY() throws SQLException {
    List<String> word = new ArrayList<>();
    // try-with-resources releases the result set even when iteration fails part-way.
    try (ResultSet resultSet = MySqlTest.excuteQuerySql("SELECT all_word FROM chinese_dictionary_py;")) {
        while (resultSet.next()) {
            word.add(resultSet.getString(1));
        }
    }
    Save.saveStringList(word, "word");
}
The code includes methods to fetch the list of pinyin, retrieve all words for each pinyin, verify completeness, and save the collected data, with utility functions for HTTP requests, regex extraction, and database operations.
Running the program produces output showing the number of words collected and the generated SQL INSERT statements; screenshots of the results are provided.
Additional sections list curated technical and non‑technical articles for further reading.
Signed-in readers can open the original source through BestHub's protected redirect.
This article has been distilled and summarized from source material, then republished for learning and reference. If you believe it infringes your rights, please contact us and we will review it promptly.
How this landed with the community
Was this worth your time?
0 Comments
Thoughtful readers leave field notes, pushback, and hard-won operational detail here.
