Backend Development 16 min read

Master Python Web Scraping: Download Pear Video Clips with Threads and Asyncio

This tutorial walks you through building a Python scraper that extracts video titles and URLs from Pear Video, then downloads the videos to your computer using single‑threaded requests, a thread‑pool, and asynchronous asyncio techniques, complete with full code examples.

Python Crawling & Data Mining

Jan 5, 2022

Master Python Web Scraping: Download Pear Video Clips with Threads and Asyncio

Goal

Download technology‑related short videos from the Pear Video website and save them locally.

Tools

Python 3.9

PyCharm 2020

Required libraries (third‑party packages plus a few standard‑library modules)

requests

parsel

fake_useragent

random

json

concurrent.futures

asyncio

aiohttp

aiofiles

Single‑thread download

Extract the video name and the real video URL, send a GET request, and write the binary content to a .mp4 file.

import requests
from parsel import Selector
from fake_useragent import UserAgent
import random, json, os

class PearVideo:
    """Single-threaded Pear Video scraper.

    Walks ``page`` listing pages, opens each video's detail page to read its
    content id, asks ``videoStatus.jsp`` for the (obfuscated) source URL,
    rewrites that URL into the real one and saves the video as an ``.mp4``
    file under ``./单线程视频下载``. Every step runs sequentially.
    """

    # Characters that cannot appear in a file name; each is mapped to "_".
    _ILLEGAL_CHARS = str.maketrans('\\/:*?"<>|', "_" * 9)
    # Seconds to wait for a connection / for the next bytes of a response.
    _TIMEOUT = 30

    def __init__(self, page):
        """Store the number of listing pages to crawl and the request headers."""
        self.headers = {"User-Agent": UserAgent().chrome}
        self.page = page
        self.base_url = "https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=8&start="

    @classmethod
    def _safe_filename(cls, video_name, fallback="untitled"):
        """Return ``video_name`` stripped and with illegal characters replaced.

        ``fallback`` is returned when the title is missing or empty.
        """
        cleaned = (video_name or "").strip().translate(cls._ILLEGAL_CHARS)
        return cleaned or fallback

    @staticmethod
    def _real_video_url(fake_video_url, init_cid):
        """Swap the leading id of the URL's file name for ``cont-<init_cid>``.

        Only the file-name segment is rewritten so that an identical digit
        sequence elsewhere in the URL is never touched.
        """
        head, sep, filename = fake_video_url.rpartition("/")
        fake_cid = filename.split("-")[0]
        return head + sep + filename.replace(fake_cid, "cont-" + init_cid, 1)

    def start_request(self):
        """Fetch every listing page and hand its parsed HTML to ``parse``."""
        for p in range(self.page):
            # The listing offset advances by 12 per page.
            start_url = self.base_url + str(p * 12)
            res = requests.get(start_url, headers=self.headers, timeout=self._TIMEOUT)
            if res.status_code == 200:
                self.parse(Selector(res.text))

    def parse(self, response):
        """Extract each video's detail link and title from a listing page."""
        for video in response.xpath("//div[@class='vervideo-bd']"):
            href = video.xpath("./a/@href").get()
            if not href:
                # A card without a link cannot be resolved to a video.
                continue
            detail_url = "https://www.pearvideo.com/" + href
            video_name = video.xpath(".//div[@class='vervideo-title']/text()").get()
            self.parse_detail(detail_url, video_name)

    def parse_detail(self, detail_url, video_name):
        """Read the content id from the detail page and query the status API."""
        detail_res = requests.get(detail_url, headers=self.headers, timeout=self._TIMEOUT)
        detail_selector = Selector(detail_res.text)
        init_cid = detail_selector.xpath("//div[@id='poster']/@data-cid").get()
        if not init_cid:
            print(f"{video_name} skipped: no data-cid on {detail_url}")
            return
        mrd = random.random()
        ajax_url = f"https://www.pearvideo.com/videoStatus.jsp?contId={init_cid}&mrd={mrd}"
        ajax_header = {"Referer": f"https://www.pearvideo.com/video_{init_cid}"}
        self.parse_ajax(ajax_url, init_cid, video_name, ajax_header)

    def parse_ajax(self, ajax_url, init_cid, video_name, ajax_header):
        """Resolve the real video URL from the status response and download it."""
        ajax_res = requests.get(ajax_url, headers=ajax_header, timeout=self._TIMEOUT)
        try:
            fake_video_url = json.loads(ajax_res.text)["videoInfo"]["videos"]["srcUrl"]
        except (ValueError, KeyError, TypeError):
            # Non-JSON body or a payload without the expected keys.
            print(f"{video_name} skipped: no video URL in status response")
            return
        real_video_url = self._real_video_url(fake_video_url, init_cid)
        self.download_video(video_name, real_video_url, ajax_header)

    def download_video(self, video_name, video_url, ajax_header):
        """Download ``video_url`` and write it to ``<cwd>/单线程视频下载/<name>.mp4``."""
        video_res = requests.get(video_url, headers=ajax_header, timeout=self._TIMEOUT)
        if video_res.status_code != 200:
            # Do not save an error page as if it were a video.
            print(f"{video_name} skipped: HTTP {video_res.status_code}")
            return
        video_path = os.path.join(os.getcwd(), "单线程视频下载")
        os.makedirs(video_path, exist_ok=True)
        file_path = os.path.join(video_path, f"{self._safe_filename(video_name)}.mp4")
        with open(file_path, "wb") as f:
            f.write(video_res.content)
        print(f"{video_name} 下载完毕")

    def run(self):
        """Entry point: crawl and download everything sequentially."""
        self.start_request()

if __name__ == "__main__":
    # Crawl the first three listing pages when executed as a script.
    pv = PearVideo(page=3)
    pv.run()

Thread‑pool download

Collect video dictionaries during parsing and use ThreadPoolExecutor to download several videos concurrently.

class PearVideo:
    """Thread-pool Pear Video scraper.

    Parsing collects ``{"video_url", "video_name"}`` dictionaries in
    ``self.video_list``; ``run`` then downloads them with four worker threads
    into ``./线程池视频下载``.
    """

    # Characters that cannot appear in a file name; each is mapped to "_".
    _ILLEGAL_CHARS = str.maketrans('\\/:*?"<>|', "_" * 9)

    def __init__(self, page):
        """Store the number of listing pages to crawl and the request headers."""
        self.headers = {"User-Agent": UserAgent().chrome}
        self.page = page
        self.base_url = "https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=8&start="
        self.video_list = []

    @classmethod
    def _safe_filename(cls, video_name, fallback="untitled"):
        """Return ``video_name`` stripped and with illegal characters replaced.

        ``fallback`` is returned when the title is missing or empty.
        """
        cleaned = (video_name or "").strip().translate(cls._ILLEGAL_CHARS)
        return cleaned or fallback

    # ... same parsing methods, but parse_ajax now appends:
    def parse_ajax(self, ajax_url, init_cid, video_name, ajax_header):
        """Resolve the real video URL and queue it for download."""
        ajax_res = requests.get(ajax_url, headers=ajax_header)
        try:
            fake_video_url = json.loads(ajax_res.text)["videoInfo"]["videos"]["srcUrl"]
        except (ValueError, KeyError, TypeError):
            # Non-JSON body or a payload without the expected keys.
            print(f"{video_name} skipped: no video URL in status response")
            return
        fake_cid = fake_video_url.split("/")[-1].split("-")[0]
        real_cid = "cont-" + init_cid
        real_video_url = fake_video_url.replace(fake_cid, real_cid)
        self.video_list.append({"video_url": real_video_url, "video_name": video_name})

    def download_video(self, video_dict):
        """Download one queued video to ``<cwd>/线程池视频下载/<name>.mp4``."""
        video_res = requests.get(video_dict["video_url"], headers=self.headers)
        video_path = os.path.join(os.getcwd(), "线程池视频下载")
        # exist_ok avoids the exists()/mkdir() race between worker threads.
        os.makedirs(video_path, exist_ok=True)
        file_name = f"{self._safe_filename(video_dict['video_name'])}.mp4"
        with open(os.path.join(video_path, file_name), "wb") as f:
            f.write(video_res.content)
        print(f"{video_dict['video_name']} 下载完毕")

    def run(self):
        """Crawl the listing pages, then download all queued videos concurrently."""
        self.start_request()
        from concurrent.futures import ThreadPoolExecutor
        # The context manager waits for all workers and shuts the pool down.
        with ThreadPoolExecutor(4) as pool:
            futures = [
                (pool.submit(self.download_video, video), video)
                for video in self.video_list
            ]
            for future, video in futures:
                # exception() blocks until the download finishes; report
                # failures instead of letting them vanish inside the pool.
                error = future.exception()
                if error is not None:
                    print(f"{video['video_name']} download failed: {error!r}")

if __name__ == "__main__":
    # Crawl the first two listing pages when executed as a script.
    pv = PearVideo(page=2)
    pv.run()

Asyncio download

Use asynchronous functions with aiohttp and aiofiles to download videos concurrently without blocking.

class PearVideo:
    """Asyncio Pear Video scraper.

    Parsing fills the parallel lists ``self.video_urls`` and
    ``self.video_names``; ``run`` then downloads every video concurrently with
    aiohttp and writes the files with aiofiles into ``./协程视频下载``.
    """

    # Characters that cannot appear in a file name; each is mapped to "_".
    _ILLEGAL_CHARS = str.maketrans('\\/:*?"<>|', "_" * 9)
    # Bytes fetched per read while streaming a response to disk.
    _CHUNK_SIZE = 64 * 1024

    def __init__(self, page):
        """Store the number of listing pages to crawl and the request headers."""
        self.headers = {"User-Agent": UserAgent().chrome}
        self.page = page
        self.base_url = "https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=8&start="
        self.video_urls = []
        self.video_names = []

    @classmethod
    def _safe_filename(cls, video_name, fallback="untitled"):
        """Return ``video_name`` stripped and with illegal characters replaced.

        ``fallback`` is returned when the title is missing or empty.
        """
        cleaned = (video_name or "").strip().translate(cls._ILLEGAL_CHARS)
        return cleaned or fallback

    # parsing methods fill self.video_urls / self.video_names
    async def download_videos(self, session, video_url, video_name, video_path):
        """Stream one video from ``video_url`` into ``video_path/<name>.mp4``.

        Raises the aiohttp error produced by ``raise_for_status`` when the
        server answers with an error status.
        """
        file_path = os.path.join(video_path, f"{self._safe_filename(video_name)}.mp4")
        async with session.get(video_url, headers=self.headers) as res:
            # Do not save an error page as if it were a video.
            res.raise_for_status()
            async with aiofiles.open(file_path, "wb") as f:
                # Write chunk by chunk so a whole video is never held in memory.
                async for chunk in res.content.iter_chunked(self._CHUNK_SIZE):
                    await f.write(chunk)
            print(f"{video_name} 下载完毕")

    async def main(self):
        """Download all collected videos concurrently over one client session."""
        video_path = os.path.join(os.getcwd(), "协程视频下载")
        os.makedirs(video_path, exist_ok=True)
        async with aiohttp.ClientSession() as session:
            tasks = [
                self.download_videos(session, url, name, video_path)
                for url, name in zip(self.video_urls, self.video_names)
            ]
            # return_exceptions keeps one failed download from tearing down
            # the session while the other downloads are still running.
            results = await asyncio.gather(*tasks, return_exceptions=True)
        for name, result in zip(self.video_names, results):
            if isinstance(result, BaseException):
                print(f"{name} download failed: {result!r}")

    def run(self):
        """Crawl the listing pages synchronously, then run the async downloads."""
        self.start_request()
        asyncio.run(self.main())

if __name__ == "__main__":
    # Crawl the first three listing pages when executed as a script.
    pv = PearVideo(page=3)
    pv.run()

Additional notes

When constructing the real video URL, replace the fake CID in the intercepted URL with cont‑{data‑cid}. Filenames containing characters such as \ / * ? < > | must be sanitized; a helper rename function can filter illegal symbols. Further performance gains are possible by adopting a producer‑consumer pattern for URL generation and download.

Original Source

Signed-in readers can open the original source through BestHub's protected redirect.

Republication Notice

This article has been distilled and summarized from source material, then republished for learning and reference. If you believe it infringes your rights, please contact us and we will review it promptly.

Python multithreading asyncio web-scraping Video Downloading pearvideo

Written by

Python Crawling & Data Mining

Life's short, I code in Python. This channel shares Python web crawling, data mining, analysis, processing, visualization, automated testing, DevOps, big data, AI, cloud computing, machine learning tools, resources, news, technical articles, tutorial videos and learning materials. Join us!

0 followers

Reader feedback

How this landed with the community

Rate this article

Was this worth your time?

Discussion

0 Comments

Thoughtful readers leave field notes, pushback, and hard-won operational detail here.