Build a Python Baidu Baike Crawler: Step-by-Step Guide
This article demonstrates how to create a Python web crawler that fetches Baidu Baike entries, covering the main program structure, URL manager, page downloader, HTML parser using BeautifulSoup, and output generator, with complete code snippets and sample results.
This article demonstrates a Python web crawler that extracts Baidu Baike entries.
Crawler Main Program Entry
from crawler_test.html_downloader import UrlDownLoader
from crawler_test.html_outer import HtmlOuter
from crawler_test.html_parser import HtmlParser
from crawler_test.url_manager import UrlManager
# Crawler main entry
class MainCrawler():
    """Drive one crawl by wiring together the four collaborating components.

    The crawler pulls a URL from the URL manager, downloads it, parses it,
    feeds the discovered links back to the URL manager and hands the
    extracted record to the outputter.
    """

    def __init__(self):
        # Initialize four processors: URL manager, downloader, parser, outputter
        self.urls = UrlManager()
        self.downloader = UrlDownLoader()
        self.parser = HtmlParser()
        self.outer = HtmlOuter()

    def start_craw(self, main_url, max_count=10):
        """Crawl starting from ``main_url`` and write the collected results.

        Args:
            main_url: Seed URL that is queued first.
            max_count: Maximum number of pages that may be crawled
                successfully before the loop stops. Failed pages do not
                count towards this limit. Defaults to 10.

        The outputter's ``output()`` is always called once the loop ends,
        even if every page failed.
        """
        print('爬虫开始...')
        crawled = 0
        self.urls.add_new_url(main_url)
        while crawled < max_count and self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print('爬虫%d,%s' % (crawled + 1, new_url))
                html_cont = self.downloader.down_load(new_url)
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                # Add parsed URLs to manager, data to outputter
                self.urls.add_new_urls(new_urls)
                self.outer.conllect_data(new_data)
            except Exception:
                # Best effort: one bad page must not abort the whole crawl.
                # Only Exception is caught so that Ctrl-C (KeyboardInterrupt)
                # and SystemExit still stop the program.
                print('爬虫失败一条')
            else:
                crawled += 1
        self.outer.output()
        print('爬虫结束。')
# Script entry point: crawl outward from the Baidu Baike "Python" entry.
if __name__ == '__main__':
    main_url = 'https://baike.baidu.com/item/Python/407313'
    mc = MainCrawler()
    mc.start_craw(main_url)
# URL Manager
class UrlManager():
    """Keep track of URLs waiting to be crawled and URLs already handed out."""

    def __init__(self):
        self.new_urls = set()  # URLs to crawl
        self.old_urls = set()  # Already crawled URLs

    def add_new_url(self, url):
        """Queue ``url`` unless it is None, already queued, or already handed out."""
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        """Queue every URL in ``urls``; None or an empty collection is a no-op.

        Any iterable is accepted, including generators, which have no ``len()``.
        """
        if not urls:
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        """Return True while at least one URL is waiting to be crawled."""
        return bool(self.new_urls)

    def get_new_url(self):
        """Remove and return one pending URL, recording it as already crawled.

        The URL is taken with ``set.pop()``, so the order is arbitrary.
        Raises KeyError when no URL is pending.
        """
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url
# Web Page Downloader
from urllib import request
class UrlDownLoader():
    """Fetch the raw body of a web page with ``urllib.request``."""

    def down_load(self, url, timeout=10):
        """Download ``url`` with a GET request and return the response body.

        Args:
            url: Address to fetch. ``None`` is accepted and yields ``None``.
            timeout: Seconds to wait on the connection before giving up, so
                that a stalled server cannot hang the crawl indefinitely.

        Returns:
            The body as returned by ``read()``, or ``None`` when ``url`` is
            None or the response status is not 200.

        Errors raised by ``urlopen`` (including timeouts) are not caught
        here and propagate to the caller.
        """
        if url is None:
            return None
        rt = request.Request(url=url, method='GET')  # Send GET request
        with request.urlopen(rt, timeout=timeout) as rp:
            if rp.status != 200:
                return None
            return rp.read()  # Read page content
# Web Page Parser
import re
from urllib import parse
from bs4 import BeautifulSoup
class HtmlParser():
    """Extract follow-up entry links and the title/summary record from a page."""

    # Matches hrefs containing "/item/" followed by one or more
    # percent-encoded bytes. Compiled once instead of on every page.
    _ITEM_HREF_RE = re.compile(r'/item/(\%\w{2})+')

    def _get_new_url(self, main_url, soup):
        """Return the set of absolute URLs for all matching ``<a>`` links.

        Relative hrefs are resolved against ``main_url``.
        """
        anchors = soup.find_all('a', href=self._ITEM_HREF_RE)
        return {parse.urljoin(main_url, anchor['href']) for anchor in anchors}

    def _get_new_data(self, main_url, soup):
        """Return a dict with the page's ``url``, ``title`` and ``content``.

        The title is the text of the ``<h1>`` inside the
        ``dd.lemmaWgt-lemmaTitle-title`` element; the content is the text of
        the ``div.lemma-summary`` element. Raises AttributeError when either
        element is missing, because ``find`` then returns None.
        """
        title_node = soup.find('dd', class_='lemmaWgt-lemmaTitle-title').find('h1')
        summary_node = soup.find('div', attrs={'label-module': 'lemmaSummary'}, class_='lemma-summary')
        return {
            'url': main_url,
            'title': title_node.get_text(),
            'content': summary_node.get_text(),
        }

    def parse(self, main_url, html_cont):
        """Parse a downloaded page.

        Returns:
            A ``(new_urls, new_data)`` tuple, or ``None`` (not a tuple) when
            ``main_url`` or ``html_cont`` is None. Callers that unpack the
            result must be prepared for the ``None`` case.
        """
        if main_url is None or html_cont is None:
            return
        soup = BeautifulSoup(html_cont, 'lxml', from_encoding='utf-8')
        new_url = self._get_new_url(main_url, soup)
        new_data = self._get_new_data(main_url, soup)
        return new_url, new_data
# Output Processor
class HtmlOuter():
    """Collect crawled records and render them as an HTML table."""

    def __init__(self):
        self.datas = []

    def conllect_data(self, data):
        """Append ``data`` to the collected records.

        Returns the list of collected records, or ``None`` when ``data`` is
        None (in which case nothing is stored).
        """
        if data is None:
            return
        self.datas.append(data)
        return self.datas

    def output(self, file='output_html.html'):
        """Write all collected records to ``file`` as a UTF-8 HTML table.

        Each record must provide the keys ``url``, ``title`` and ``content``.
        Cell values come from crawled pages, so they are HTML-escaped before
        being written; otherwise stray markup in a page could break the
        table or inject tags into the report.
        """
        import html  # local import: only this method needs it

        cell = '<td style="border:1px solid black">{0}</td>'
        parts = [
            '<html>',
            '<head>',
            '<meta charset="utf-8">',
            '<title>爬虫数据结果</title>',
            '</head>',
            '<body>',
            '<table style="border-collapse:collapse; border:1px solid gray; width:80%; word-break:break-all; margin:20px auto;">',
            '<tr>',
            '<th style="border:1px solid black; width:35%;">URL</th>',
            '<th style="border:1px solid black; width:15%;">词条</th>',
            '<th style="border:1px solid black; width:50%;">内容</th>',
            '</tr>',
        ]
        for data in self.datas:
            parts.append('<tr>')
            for key in ('url', 'title', 'content'):
                parts.append(cell.format(html.escape(str(data[key]))))
            parts.append('</tr>')
        parts.extend(['</table>', '</body>', '</html>'])
        # Build the document first and write it in one call, so a malformed
        # record cannot leave a half-written file behind.
        with open(file, 'w', encoding='utf-8') as fh:
            fh.write(''.join(parts))
# The sample output shows a table with URLs, entry titles, and extracted content, confirming that the crawler works as intended.
Signed-in readers can open the original source through BestHub's protected redirect.
This article has been distilled and summarized from source material, then republished for learning and reference. If you believe it infringes your rights, please contact us and we will review it promptly.
MaGe Linux Operations
Founded in 2009, MaGe Education is a top Chinese high‑end IT training brand. Its graduates earn 12K+ RMB salaries, and the school has trained tens of thousands of students. It offers high‑pay courses in Linux cloud operations, Python full‑stack, automation, data analysis, AI, and Go high‑concurrency architecture. Thanks to quality courses and a solid reputation, it has talent partnerships with numerous internet firms.
How this landed with the community
Was this worth your time?
0 Comments
Thoughtful readers leave field notes, pushback, and hard-won operational detail here.
