Master Python Web Scraping: Download Pear Video Clips with Threads and Asyncio
This tutorial walks you through building a Python scraper that extracts video titles and URLs from Pear Video, then downloads the videos to your computer using single‑threaded requests, a thread‑pool, and asynchronous asyncio techniques, complete with full code examples.
Goal
Download technology‑related short videos from the Pear Video website and save them locally.
Tools
Python 3.9
PyCharm 2020
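Required libraries (requests, parsel, fake_useragent, aiohttp and aiofiles are third‑party and must be installed; random, json, os, concurrent.futures and asyncio ship with Python)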
requests
parsel
fake_useragent
random
json
os
concurrent.futures
asyncio
aiohttp
aiofiles
Single‑thread download
Extract the video name and the real video URL, send a GET request, and write the binary content to a .mp4 file.
import requests
from parsel import Selector
from fake_useragent import UserAgent
import random, json, os
class PearVideo:
    """Single-threaded Pear Video scraper.

    The pipeline is strictly sequential: each list page is fetched, every
    video entry on it is resolved to its detail page, the real video URL is
    derived from the ``videoStatus.jsp`` response, and the file is downloaded
    before the next entry is processed.

    Args:
        page: Number of list pages to crawl. List pages are requested with
            ``start`` offsets of 0, 12, 24, ...
    """

    # Characters that are rejected in file names on common file systems,
    # mapped to "_" so that a video title can safely be used as a file name.
    _ILLEGAL_FILENAME_CHARS = str.maketrans(
        {ch: "_" for ch in '\\/:*?"<>|\r\n\t'}
    )

    def __init__(self, page):
        self.headers = {"User-Agent": UserAgent().chrome}
        self.page = page
        self.base_url = "https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=8&start="

    @classmethod
    def _safe_filename(cls, name):
        """Return *name* with illegal file-name characters replaced by "_".

        ``None`` or a name that is empty after stripping whitespace becomes
        ``"untitled"`` so that a usable file name is always returned.
        """
        cleaned = (name or "").translate(cls._ILLEGAL_FILENAME_CHARS).strip()
        return cleaned or "untitled"

    def start_request(self):
        """Fetch every list page and hand each successful response to parse()."""
        for p in range(self.page):
            start_url = self.base_url + str(p * 12)
            res = requests.get(start_url, headers=self.headers)
            # Non-200 list pages are skipped rather than parsed.
            if res.status_code == 200:
                selector = Selector(res.text)
                self.parse(selector)

    def parse(self, response):
        """Extract the detail URL and title of every video on a list page.

        Args:
            response: A parsel ``Selector`` built from the list-page HTML.
        """
        videos = response.xpath("//div[@class='vervideo-bd']")
        for video in videos:
            href = video.xpath("./a/@href").get()
            if href is None:
                # Without a link there is nothing to follow; concatenating
                # None to the site prefix would raise TypeError.
                continue
            detail_url = "https://www.pearvideo.com/" + href
            video_name = video.xpath(".//div[@class='vervideo-title']/text()").get()
            self.parse_detail(detail_url, video_name)

    def parse_detail(self, detail_url, video_name):
        """Read the content id from a detail page and build the status request.

        Args:
            detail_url: Absolute URL of the video's detail page.
            video_name: Title taken from the list page (may be ``None``).
        """
        detail_res = requests.get(detail_url, headers=self.headers)
        detail_selector = Selector(detail_res.text)
        init_cid = detail_selector.xpath("//div[@id='poster']/@data-cid").get()
        if init_cid is None:
            # The content id is required to build both the status URL and the
            # real video URL; report and skip instead of failing later.
            print(f"skip {detail_url}: no data-cid found")
            return
        mrd = random.random()
        ajax_url = f"https://www.pearvideo.com/videoStatus.jsp?contId={init_cid}&mrd={mrd}"
        # The status request is sent with the detail page as its Referer.
        ajax_header = {"Referer": f"https://www.pearvideo.com/video_{init_cid}"}
        self.parse_ajax(ajax_url, init_cid, video_name, ajax_header)

    def parse_ajax(self, ajax_url, init_cid, video_name, ajax_header):
        """Turn the URL returned by the status endpoint into the real video URL.

        The prefix of the last path segment of ``srcUrl`` (the text before
        its first "-") is replaced with ``cont-<init_cid>``.

        Args:
            ajax_url: URL of the ``videoStatus.jsp`` request.
            init_cid: Content id read from the detail page.
            video_name: Title used for the output file.
            ajax_header: Headers (Referer) sent with this and the download request.
        """
        ajax_res = requests.get(ajax_url, headers=ajax_header)
        fake_video_url = json.loads(ajax_res.text)["videoInfo"]["videos"]["srcUrl"]
        fake_cid = fake_video_url.split("/")[-1].split("-")[0]
        real_cid = "cont-" + init_cid
        real_video_url = fake_video_url.replace(fake_cid, real_cid)
        self.download_video(video_name, real_video_url, ajax_header)

    def download_video(self, video_name, video_url, ajax_header):
        """Download one video into ``<cwd>/单线程视频下载/<title>.mp4``.

        Args:
            video_name: Title of the video; sanitized before use as a file name.
            video_url: Real video URL.
            ajax_header: Headers sent with the download request.
        """
        video_res = requests.get(video_url, headers=ajax_header)
        video_path = os.path.join(os.getcwd(), "单线程视频下载")
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(video_path, exist_ok=True)
        file_path = os.path.join(video_path, f"{self._safe_filename(video_name)}.mp4")
        with open(file_path, "wb") as f:
            f.write(video_res.content)
        print(f"{video_name} 下载完毕")

    def run(self):
        """Entry point: crawl all configured pages and download every video."""
        self.start_request()
if __name__ == '__main__':
    pv = PearVideo(3)
    pv.run()
Thread‑pool download
Collect video dictionaries during parsing and use ThreadPoolExecutor to download several videos concurrently.
class PearVideo:
    """Thread-pool variant of the Pear Video scraper.

    Parsing collects ``{"video_url": ..., "video_name": ...}`` dictionaries
    in ``self.video_list``; ``run()`` then downloads them with four worker
    threads.

    Args:
        page: Number of list pages to crawl.
    """

    # Characters that are rejected in file names on common file systems,
    # mapped to "_" so that a video title can safely be used as a file name.
    _ILLEGAL_FILENAME_CHARS = str.maketrans(
        {ch: "_" for ch in '\\/:*?"<>|\r\n\t'}
    )

    def __init__(self, page):
        self.headers = {"User-Agent": UserAgent().chrome}
        self.page = page
        self.base_url = "https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=8&start="
        self.video_list = []

    @classmethod
    def _safe_filename(cls, name):
        """Return *name* with illegal file-name characters replaced by "_".

        ``None`` or a name that is empty after stripping whitespace becomes
        ``"untitled"`` so that a usable file name is always returned.
        """
        cleaned = (name or "").translate(cls._ILLEGAL_FILENAME_CHARS).strip()
        return cleaned or "untitled"

    # ... same parsing methods, but parse_ajax now appends:
    def parse_ajax(self, ajax_url, init_cid, video_name, ajax_header):
        """Resolve the real video URL and queue it for download.

        The prefix of the last path segment of ``srcUrl`` (the text before
        its first "-") is replaced with ``cont-<init_cid>``; the result is
        appended to ``self.video_list`` instead of being downloaded at once.

        Args:
            ajax_url: URL of the ``videoStatus.jsp`` request.
            init_cid: Content id read from the detail page.
            video_name: Title used for the output file.
            ajax_header: Headers sent with the status request.
        """
        ajax_res = requests.get(ajax_url, headers=ajax_header)
        fake_video_url = json.loads(ajax_res.text)["videoInfo"]["videos"]["srcUrl"]
        fake_cid = fake_video_url.split("/")[-1].split("-")[0]
        real_cid = "cont-" + init_cid
        real_video_url = fake_video_url.replace(fake_cid, real_cid)
        self.video_list.append({"video_url": real_video_url, "video_name": video_name})

    def download_video(self, video_dict):
        """Download one queued video into ``<cwd>/线程池视频下载/<title>.mp4``.

        Args:
            video_dict: Mapping with the keys ``"video_url"`` and ``"video_name"``.
        """
        video_res = requests.get(video_dict["video_url"], headers=self.headers)
        video_path = os.path.join(os.getcwd(), "线程池视频下载")
        # Several workers reach this point at the same time; exists() + mkdir()
        # can raise FileExistsError in the loser of that race.
        os.makedirs(video_path, exist_ok=True)
        file_name = f"{self._safe_filename(video_dict['video_name'])}.mp4"
        with open(os.path.join(video_path, file_name), "wb") as f:
            f.write(video_res.content)
        print(f"{video_dict['video_name']} 下载完毕")

    def run(self):
        """Crawl all pages, then download the collected videos with 4 threads.

        A failed download is reported on stdout and does not stop the others.
        """
        self.start_request()
        from concurrent.futures import ThreadPoolExecutor, as_completed

        # The context manager waits for all workers and shuts the pool down.
        with ThreadPoolExecutor(4) as pool:
            futures = {
                pool.submit(self.download_video, video): video
                for video in self.video_list
            }
            # pool.map() only surfaces worker exceptions when its result
            # iterator is consumed; checking each future makes them visible.
            for future in as_completed(futures):
                error = future.exception()
                if error is not None:
                    print(f"{futures[future]['video_name']} download failed: {error!r}")
if __name__ == '__main__':
    pv = PearVideo(2)
    pv.run()
Asyncio download
Use asynchronous functions with aiohttp and aiofiles to download videos concurrently without blocking.
class PearVideo:
    """Asyncio variant of the Pear Video scraper.

    Parsing fills the parallel lists ``self.video_urls`` and
    ``self.video_names``; ``run()`` then downloads all of them concurrently
    through one shared ``aiohttp`` session and writes them with ``aiofiles``.

    NOTE(review): ``asyncio``, ``aiohttp`` and ``aiofiles`` must be imported at
    module level; those imports are not shown next to this snippet.

    Args:
        page: Number of list pages to crawl.
    """

    # Characters that are rejected in file names on common file systems,
    # mapped to "_" so that a video title can safely be used as a file name.
    _ILLEGAL_FILENAME_CHARS = str.maketrans(
        {ch: "_" for ch in '\\/:*?"<>|\r\n\t'}
    )

    def __init__(self, page):
        self.headers = {"User-Agent": UserAgent().chrome}
        self.page = page
        self.base_url = "https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=8&start="
        self.video_urls = []
        self.video_names = []

    @classmethod
    def _safe_filename(cls, name):
        """Return *name* with illegal file-name characters replaced by "_".

        ``None`` or a name that is empty after stripping whitespace becomes
        ``"untitled"`` so that a usable file name is always returned.
        """
        cleaned = (name or "").translate(cls._ILLEGAL_FILENAME_CHARS).strip()
        return cleaned or "untitled"

    # parsing methods fill self.video_urls / self.video_names
    async def download_videos(self, session, video_url, video_name, video_path):
        """Download one video and write it to ``<video_path>/<title>.mp4``.

        Args:
            session: Shared ``aiohttp.ClientSession`` used for the request.
            video_url: Real video URL.
            video_name: Title of the video; sanitized before use as a file name.
            video_path: Existing directory the file is written into.
        """
        async with session.get(video_url, headers=self.headers) as res:
            content = await res.content.read()
        # The body is fully read above, so the response is released before
        # the file is written.
        file_path = os.path.join(video_path, f"{self._safe_filename(video_name)}.mp4")
        async with aiofiles.open(file_path, "wb") as f:
            await f.write(content)
        print(f"{video_name} 下载完毕")

    async def main(self):
        """Create the output directory and run all downloads concurrently."""
        video_path = os.path.join(os.getcwd(), "协程视频下载")
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(video_path, exist_ok=True)
        async with aiohttp.ClientSession() as session:
            tasks = [
                self.download_videos(session, url, name, video_path)
                for url, name in zip(self.video_urls, self.video_names)
            ]
            await asyncio.gather(*tasks)

    def run(self):
        """Entry point: crawl synchronously, then download asynchronously."""
        self.start_request()
        asyncio.run(self.main())
if __name__ == '__main__':
    pv = PearVideo(3)
    pv.run()
Additional notes
When constructing the real video URL, replace the fake CID in the intercepted URL with cont‑{data‑cid}. Filenames containing characters such as \ / * ? < > | must be sanitized; a helper rename function can filter illegal symbols. Further performance gains are possible by adopting a producer‑consumer pattern for URL generation and download.
Signed-in readers can open the original source through BestHub's protected redirect.
This article has been distilled and summarized from source material, then republished for learning and reference. If you believe it infringes your rights, please contact us and we will review it promptly.
Python Crawling & Data Mining
Life's short, I code in Python. This channel shares Python web crawling, data mining, analysis, processing, visualization, automated testing, DevOps, big data, AI, cloud computing, machine learning tools, resources, news, technical articles, tutorial videos and learning materials. Join us!
How this landed with the community
Was this worth your time?
0 Comments
Thoughtful readers leave field notes, pushback, and hard-won operational detail here.
