Backend Development 8 min read

Build a Python Baidu Baike Crawler: Step-by-Step Guide

This article demonstrates how to create a Python web crawler that fetches Baidu Baike entries, covering the main program structure, URL manager, page downloader, HTML parser using BeautifulSoup, and output generator, with complete code snippets and sample results.

MaGe Linux Operations

Sep 2, 2021

Build a Python Baidu Baike Crawler: Step-by-Step Guide

This article demonstrates a Python web crawler that extracts Baidu Baike entries.

Crawler Main Program Entry

from crawler_test.html_downloader import UrlDownLoader
from crawler_test.html_outer import HtmlOuter
from crawler_test.html_parser import HtmlParser
from crawler_test.url_manager import UrlManager

# Crawler main entry
class MainCrawler():
    """Drive a crawl by wiring together the four collaborating components.

    ``__init__`` creates a URL manager, a downloader, a parser and an
    outputter; ``start_craw`` runs the fetch/parse/collect loop over them.
    """

    def __init__(self):
        # Initialize four processors: URL manager, downloader, parser, outputter
        self.urls = UrlManager()
        self.downloader = UrlDownLoader()
        self.parser = HtmlParser()
        self.outer = HtmlOuter()

    def start_craw(self, main_url, max_count=10):
        """Crawl outward from ``main_url`` and write the collected results.

        Args:
            main_url: Seed URL registered with the URL manager before the
                loop starts.
            max_count: Number of successfully processed pages after which
                the loop stops. Defaults to 10, the limit that used to be
                hard-coded. The limit is checked after a page has been
                processed, so at least one page is always attempted, and
                pages that fail are not counted.

        Ordinary errors raised while handling one page are reported and the
        loop moves on to the next URL. ``KeyboardInterrupt`` and
        ``SystemExit`` are not caught, so the crawl can be interrupted.
        """
        print('爬虫开始...')
        count = 1
        self.urls.add_new_url(main_url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print('爬虫%d,%s' % (count, new_url))
                html_cont = self.downloader.down_load(new_url)
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                # Add parsed URLs to manager, data to outputter
                self.urls.add_new_urls(new_urls)
                self.outer.conllect_data(new_data)
                if count >= max_count:  # control number of crawls
                    break
                count += 1
            except Exception as err:
                # Catch Exception rather than everything: a bare except would
                # also trap Ctrl-C and make the crawl impossible to stop.
                print('爬虫失败一条', repr(err))
        self.outer.output()
        print('爬虫结束。')

if __name__ == '__main__':
    # Script entry point: seed the crawl with the Baidu Baike entry for Python.
    MainCrawler().start_craw('https://baike.baidu.com/item/Python/407313')

URL Manager

class UrlManager():
    """Keep two sets of URLs: those still to crawl and those already handed out."""

    def __init__(self):
        self.old_urls = set()  # URLs already returned by get_new_url
        self.new_urls = set()  # URLs waiting to be crawled

    def add_new_url(self, url):
        """Queue ``url`` unless it is None or already present in either set."""
        if url is None:
            return
        already_known = url in self.new_urls or url in self.old_urls
        if not already_known:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        """Queue every URL in ``urls``; None or an empty collection is a no-op."""
        if urls is not None and len(urls) > 0:
            for candidate in urls:
                self.add_new_url(candidate)

    def has_new_url(self):
        """Return True while at least one URL is waiting to be crawled."""
        return bool(self.new_urls)

    def get_new_url(self):
        """Remove one pending URL, record it as crawled, and return it."""
        picked = self.new_urls.pop()
        self.old_urls.add(picked)
        return picked

Web Page Downloader

from urllib import request

class UrlDownLoader():
    """Download page content with ``urllib.request``."""

    def down_load(self, url, timeout=10):
        """Fetch ``url`` with a GET request and return the response body.

        Args:
            url: Address to fetch. ``None`` is accepted and yields ``None``.
            timeout: Timeout in seconds passed to ``urlopen``. Without one a
                stalled server would block the whole crawl indefinitely.

        Returns:
            Whatever ``read()`` returns for the response, or ``None`` when
            ``url`` is None or the response status is not 200.

        Exceptions raised by ``urlopen`` are not caught here and propagate
        to the caller.
        """
        if url is None:
            return None
        rt = request.Request(url=url, method='GET')  # Send GET request
        with request.urlopen(rt, timeout=timeout) as rp:
            if rp.status != 200:
                return None
            return rp.read()  # Read page content

Web Page Parser

import re
from urllib import parse
from bs4 import BeautifulSoup

class HtmlParser():
    """Extract follow-up entry links and the title/summary record from a page."""

    def _get_new_url(self, main_url, soup):
        """Return the set of absolute URLs for matching ``<a>`` tags.

        An anchor matches when its href contains ``/item/`` followed by one
        or more ``%`` plus two word characters. Each href is resolved
        against ``main_url``.
        """
        anchors = soup.find_all('a', href=re.compile(r'/item/(\%\w{2})+'))
        return {parse.urljoin(main_url, anchor['href']) for anchor in anchors}

    def _get_new_data(self, main_url, soup):
        """Return a dict with the page's ``url``, ``title`` and ``content``.

        The title is the text of the ``h1`` inside the
        ``dd.lemmaWgt-lemmaTitle-title`` element; the content is the text of
        the ``div.lemma-summary`` element carrying
        ``label-module="lemmaSummary"``.
        """
        title_text = soup.find('dd', class_='lemmaWgt-lemmaTitle-title').find('h1').get_text()
        summary_text = soup.find(
            'div',
            attrs={'label-module': 'lemmaSummary'},
            class_='lemma-summary',
        ).get_text()
        return {
            'url': main_url,
            'title': title_text,
            'content': summary_text,
        }

    def parse(self, main_url, html_cont):
        """Parse ``html_cont`` and return ``(new_urls, new_data)``.

        Returns None when either argument is None.
        """
        if main_url is None:
            return None
        if html_cont is None:
            return None
        soup = BeautifulSoup(html_cont, 'lxml', from_encoding='utf-8')
        links = self._get_new_url(main_url, soup)
        record = self._get_new_data(main_url, soup)
        return links, record

Output Processor

class HtmlOuter():
    """Collect crawled records and render them as an HTML table."""

    def __init__(self):
        self.datas = []

    def conllect_data(self, data):
        """Append one record to the collected list.

        Returns the list of collected records, or None when ``data`` is None
        (in which case nothing is stored).
        """
        if data is None:
            return
        self.datas.append(data)
        return self.datas

    def output(self, file='output_html.html'):
        """Write every collected record to ``file`` as one HTML table.

        Each record is read through its ``'url'``, ``'title'`` and
        ``'content'`` keys. The values are HTML-escaped before being written:
        they come from crawled pages, so unescaped text could break the
        markup or inject script into the report.

        Args:
            file: Path of the HTML file to create or overwrite (UTF-8).
        """
        # Imported here so the block does not depend on a file-level import.
        from html import escape

        cell = '<td style="border:1px solid black">{0}</td>'
        with open(file, 'w', encoding='utf-8') as fh:
            fh.write('<html>')
            fh.write('<head>')
            fh.write('<meta charset="utf-8">')
            fh.write('<title>爬虫数据结果</title>')
            fh.write('</head>')
            fh.write('<body>')
            fh.write('<table style="border-collapse:collapse; border:1px solid gray; width:80%; word-break:break-all; margin:20px auto;">')
            fh.write('<tr>')
            fh.write('<th style="border:1px solid black; width:35%;">URL</th>')
            fh.write('<th style="border:1px solid black; width:15%;">词条</th>')
            fh.write('<th style="border:1px solid black; width:50%;">内容</th>')
            fh.write('</tr>')
            for data in self.datas:
                fh.write('<tr>')
                for key in ('url', 'title', 'content'):
                    fh.write(cell.format(escape(str(data[key]))))
                fh.write('</tr>')
            fh.write('</table>')
            fh.write('</body>')
            fh.write('</html>')

The sample output shows a table with URLs, entry titles, and extracted content, confirming that the crawler works as intended.

Original Source

Signed-in readers can open the original source through BestHub's protected redirect.

Republication Notice

This article has been distilled and summarized from source material, then republished for learning and reference. If you believe it infringes your rights, please contact us and we will review it promptly.

html-parsing Python beautifulsoup Web Crawler data-extraction baidu-baike

Written by

MaGe Linux Operations

Founded in 2009, MaGe Education is a top Chinese high‑end IT training brand. Its graduates earn 12K+ RMB salaries, and the school has trained tens of thousands of students. It offers high‑pay courses in Linux cloud operations, Python full‑stack, automation, data analysis, AI, and Go high‑concurrency architecture. Thanks to quality courses and a solid reputation, it has talent partnerships with numerous internet firms.

0 followers

Reader feedback

How this landed with the community

Rate this article

Was this worth your time?

Discussion

0 Comments

Thoughtful readers leave field notes, pushback, and hard-won operational detail here.