Build a Spring Boot Web Crawler with WebMagic, MyBatis, and MySQL
This tutorial demonstrates how to combine Spring Boot, WebMagic, and MyBatis to crawl Zhihu pages, configure Maven dependencies, set up MySQL data sources, define entity and mapper classes, and schedule the crawler to run periodically, providing a complete Java web‑crawling scaffold.
WebMagic is an open‑source Java crawler framework. This guide shows how to integrate Spring Boot, WebMagic and MyBatis to crawl Zhihu pages and persist the results into a MySQL database.
1. Add Maven dependencies
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>hyzx</groupId>
<artifactId>qbasic-crawler</artifactId>
<version>1.0.0</version>
<parent>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-parent</artifactId>
<version>1.5.21.RELEASE</version>
<relativePath/><!-- lookup parent from repository -->
</parent>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<maven.test.skip>true</maven.test.skip>
<java.version>1.8</java.version>
<maven.compiler.plugin.version>3.8.1</maven.compiler.plugin.version>
<maven.resources.plugin.version>3.1.0</maven.resources.plugin.version>
<mysql.connector.version>5.1.47</mysql.connector.version>
<druid.spring.boot.starter.version>1.1.17</druid.spring.boot.starter.version>
<mybatis.spring.boot.starter.version>1.3.4</mybatis.spring.boot.starter.version>
<fastjson.version>1.2.58</fastjson.version>
<commons.lang3.version>3.9</commons.lang3.version>
<joda.time.version>2.10.2</joda.time.version>
<webmagic.core.version>0.7.3</webmagic.core.version>
</properties>
<dependencies>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-devtools</artifactId>
<scope>runtime</scope>
<optional>true</optional>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-configuration-processor</artifactId>
<optional>true</optional>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>${mysql.connector.version}</version>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>druid-spring-boot-starter</artifactId>
<version>${druid.spring.boot.starter.version}</version>
</dependency>
<dependency>
<groupId>org.mybatis.spring.boot</groupId>
<artifactId>mybatis-spring-boot-starter</artifactId>
<version>${mybatis.spring.boot.starter.version}</version>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>${fastjson.version}</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>${commons.lang3.version}</version>
</dependency>
<dependency>
<groupId>joda-time</groupId>
<artifactId>joda-time</artifactId>
<version>${joda.time.version}</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>${webmagic.core.version}</version>
<exclusions>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
</exclusion>
</exclusions>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>${maven.compiler.plugin.version}</version>
<configuration>
<source>${java.version}</source>
<target>${java.version}</target>
<encoding>${project.build.sourceEncoding}</encoding>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-resources-plugin</artifactId>
<version>${maven.resources.plugin.version}</version>
<configuration>
<encoding>${project.build.sourceEncoding}</encoding>
</configuration>
</plugin>
<plugin>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-maven-plugin</artifactId>
<configuration>
<fork>true</fork>
<addResources>true</addResources>
</configuration>
<executions>
<execution>
<goals>
<goal>repackage</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
<repositories>
<!-- Aliyun public mirror, accessed over HTTPS: Maven 3.8.1 and later block plain-HTTP external repositories by default. -->
<repository>
<id>public</id>
<name>aliyun nexus</name>
<url>https://maven.aliyun.com/nexus/content/groups/public/</url>
<releases><enabled>true</enabled></releases>
</repository>
</repositories>
<pluginRepositories>
<!-- Aliyun public mirror for plugins, accessed over HTTPS: Maven 3.8.1 and later block plain-HTTP external repositories by default. -->
<pluginRepository>
<id>public</id>
<name>aliyun nexus</name>
<url>https://maven.aliyun.com/nexus/content/groups/public/</url>
<releases><enabled>true</enabled></releases>
<snapshots><enabled>false</enabled></snapshots>
</pluginRepository>
</pluginRepositories>
</project>
2. application.properties
# mysql data source configuration
spring.datasource.name=mysql
spring.datasource.type=com.alibaba.druid.pool.DruidDataSource
spring.datasource.driver-class-name=com.mysql.jdbc.Driver
spring.datasource.url=jdbc:mysql://192.168.0.63:3306/gjhzjl?useUnicode=true&characterEncoding=utf8&useSSL=false&allowMultiQueries=true
spring.datasource.username=root
spring.datasource.password=root
# druid pool configuration
spring.datasource.druid.initial-size=5
spring.datasource.druid.min-idle=5
spring.datasource.druid.max-active=10
spring.datasource.druid.max-wait=60000
spring.datasource.druid.validation-query=SELECT 1 FROM DUAL
spring.datasource.druid.test-on-borrow=false
spring.datasource.druid.test-on-return=false
spring.datasource.druid.test-while-idle=true
spring.datasource.druid.time-between-eviction-runs-millis=60000
spring.datasource.druid.min-evictable-idle-time-millis=300000
spring.datasource.druid.max-evictable-idle-time-millis=600000
# mybatis configuration
mybatis.mapperLocations=classpath:mapper/**/*.xml
3. Database table schema
CREATE TABLE `cms_content` (
`contentId` varchar(40) NOT NULL COMMENT '内容ID',
`title` varchar(150) NOT NULL COMMENT '标题',
`content` longtext COMMENT '文章内容',
`releaseDate` datetime NOT NULL COMMENT '发布日期',
PRIMARY KEY (`contentId`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='CMS内容表';
4. Entity class
import java.util.Date;
public class CmsContentPO {
// Fields mirror the columns of the cms_content table (same names as in the DDL and the mapper insert).
// Content ID; primary key column `contentId`.
private String contentId;
// Title; column `title`.
private String title;
// Article content; column `content`.
private String content;
// Release date; column `releaseDate`.
private Date releaseDate;
// Plain getters and setters, one pair per field.
public String getContentId() { return contentId; }
public void setContentId(String contentId) { this.contentId = contentId; }
public String getTitle() { return title; }
public void setTitle(String title) { this.title = title; }
public String getContent() { return content; }
public void setContent(String content) { this.content = content; }
public Date getReleaseDate() { return releaseDate; }
public void setReleaseDate(Date releaseDate) { this.releaseDate = releaseDate; }
}
5. Mapper interface
public interface CrawlerMapper {
/**
 * Inserts the given record into {@code cms_content}; bound to the
 * {@code addCmsContent} insert statement in CrawlerMapper.xml.
 * NOTE(review): the int result is presumably the affected-row count
 * (ZhihuPipeline treats a value greater than 0 as success) - confirm.
 */
int addCmsContent(CmsContentPO record);
}
6. CrawlerMapper.xml
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<mapper namespace="com.hyzx.qbasic.dao.CrawlerMapper">
<!-- Inserts one cms_content row; the four column values are read from the CmsContentPO parameter's properties of the same names. -->
<insert id="addCmsContent" parameterType="com.hyzx.qbasic.model.CmsContentPO">
insert into cms_content (contentId, title, releaseDate, content)
values (#{contentId,jdbcType=VARCHAR}, #{title,jdbcType=VARCHAR}, #{releaseDate,jdbcType=TIMESTAMP}, #{content,jdbcType=LONGVARCHAR})
</insert>
</mapper>
7. ZhihuPageProcessor (parses Zhihu HTML)
@Component
public class ZhihuPageProcessor implements PageProcessor {
private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);
/**
 * Handles one downloaded page: queues every link that matches the Zhihu
 * answer URL pattern, then stores the question title and the answer text as
 * the "title" and "answer" result fields. A page with no title is marked as
 * skipped.
 *
 * @param page the downloaded page to extract links and fields from
 */
@Override
public void process(Page page) {
    // Follow-up requests: every link on the page that points at an answer.
    page.addTargetRequests(
            page.getHtml()
                    .links()
                    .regex("https://www\\.zhihu\\.com/question/\\d+/answer/\\d+.*")
                    .all());

    String questionTitle = page.getHtml()
            .xpath("//h1[@class='QuestionHeader-title']/text()")
            .toString();
    String answerText = page.getHtml()
            .xpath("//div[@class='QuestionAnswer-content']/tidyText()")
            .toString();

    page.putField("title", questionTitle);
    page.putField("answer", answerText);

    // No title extracted: mark the page as skipped.
    if (questionTitle == null) {
        page.setSkip(true);
    }
}
/**
 * Returns the crawl settings held in the {@code site} field, which is built
 * with {@code setRetryTimes(3)} and {@code setSleepTime(1000)}.
 */
@Override
public Site getSite() { return site; }
}
8. ZhihuPipeline (stores data into MySQL)
@Component
public class ZhihuPipeline implements Pipeline {
private static final Logger LOGGER = LoggerFactory.getLogger(ZhihuPipeline.class);
@Autowired
private CrawlerMapper crawlerMapper;
/**
 * Persists one crawled page as a {@code cms_content} row.
 *
 * <p>Reads the "title" and "answer" fields from the result items, builds a
 * {@link CmsContentPO} with a random UUID as its ID and the current time as
 * its release date, and inserts it through {@code crawlerMapper}. Failures are
 * logged and not rethrown.
 *
 * @param resultItems the fields extracted by the page processor
 * @param task        the task the results belong to (not used here)
 */
@Override
public void process(ResultItems resultItems, Task task) {
    String title = resultItems.get("title");
    String answer = resultItems.get("answer");

    CmsContentPO contentPO = new CmsContentPO();
    contentPO.setContentId(UUID.randomUUID().toString());
    contentPO.setTitle(title);
    contentPO.setReleaseDate(new Date());
    contentPO.setContent(answer);

    try {
        int affectedRows = crawlerMapper.addCmsContent(contentPO);
        // Report success only when the insert actually wrote a row.
        if (affectedRows > 0) {
            LOGGER.info("Saved Zhihu article successfully: {}", title);
        } else {
            LOGGER.warn("Zhihu article was not saved (no rows inserted): {}", title);
        }
    } catch (Exception ex) {
        // A trailing Throwable argument is logged with its stack trace.
        LOGGER.error("Failed to save Zhihu article: {}", title, ex);
    }
}
}
9. ZhihuTask (schedules the crawler)
@Component
public class ZhihuTask {
private static final Logger LOGGER = LoggerFactory.getLogger(ZhihuTask.class);
@Autowired private ZhihuPipeline zhihuPipeline;
@Autowired private ZhihuPageProcessor zhihuPageProcessor;
private ScheduledExecutorService timer = Executors.newSingleThreadScheduledExecutor();
/**
 * Schedules the Zhihu crawl on the single-threaded {@code timer}: the first
 * run starts immediately, and each later run starts 10 minutes after the
 * previous one has finished. This method only registers the schedule and
 * returns right away.
 *
 * <p>Each run builds a new spider seeded with the Zhihu "explore" page,
 * using 2 worker threads and {@code zhihuPipeline} for persistence.
 */
public void crawl() {
    timer.scheduleWithFixedDelay(() -> {
        Thread.currentThread().setName("zhihuCrawlerThread");
        try {
            Spider.create(zhihuPageProcessor)
                    .addUrl("https://www.zhihu.com/explore")
                    .addPipeline(zhihuPipeline)
                    .thread(2)
                    // run() blocks until the crawl completes. The asynchronous
                    // start() returns immediately, so the fixed delay would not
                    // be measured from the end of the crawl, crawls could
                    // overlap, and this try/catch could not see crawl failures.
                    .run();
        } catch (Exception ex) {
            // Swallow after logging: an exception escaping a scheduled task
            // would cancel all subsequent executions.
            LOGGER.error("Zhihu crawling thread exception", ex);
        }
    }, 0, 10, TimeUnit.MINUTES);
}
}
10. Spring Boot application entry point
@SpringBootApplication
@MapperScan(basePackages = "com.hyzx.qbasic.dao")
public class Application implements CommandLineRunner {
@Autowired private ZhihuTask zhihuTask;
/** JVM entry point: launches the Spring application with {@code Application} as the primary source. */
public static void main(String[] args) throws IOException {
SpringApplication.run(Application.class, args);
}
/** {@code CommandLineRunner} hook: registers the periodic Zhihu crawl via {@code zhihuTask.crawl()}. */
@Override
public void run(String... args) throws Exception {
zhihuTask.crawl();
}
}
This complete example provides a ready‑to‑run scaffold for building Java web crawlers that fetch content from Zhihu, map it to POJOs, and store it in a MySQL database using Spring Boot, MyBatis, and WebMagic.
Signed-in readers can open the original source through BestHub's protected redirect.
This article has been distilled and summarized from source material, then republished for learning and reference. If you believe it infringes your rights, please contact us and we will review it promptly.
Programmer DD
A tinkering programmer and author of "Spring Cloud Microservices in Action"
How this landed with the community
Was this worth your time?
0 Comments
Thoughtful readers leave field notes, pushback, and hard-won operational detail here.
