Big Data 13 min read

How to Scrape Maoyan Movie Data and Visualize Trends with Python

This tutorial walks you through collecting movie information from Maoyan using Python web‑scraping, storing the results in CSV, and then applying pandas, matplotlib, and WordCloud to analyze and visualize trends such as release years, genres, regions, durations, and ratings across China and the world.

MaGe Linux Operations

Jan 9, 2022

How to Scrape Maoyan Movie Data and Visualize Trends with Python

Tools Preparation

Data source: https://maoyan.com/board/4?offset=1

Development environment: Windows 10, Python 3.7

IDE and browser: PyCharm, Chrome

Project Idea

The goal is to crawl all movie information from Maoyan, using the top‑100 list as an example, and extract fields such as movie name, rating, link, genre, release location, duration, and release year.

Scraping Code

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2021年06月05日
# @File    : demo4.py

import requests
from fake_useragent import UserAgent
from lxml import etree
import time

# Source of User-Agent strings (fake_useragent)
ua = UserAgent()

# Fixed header values. The cookie is hard-coded and may need to be
# updated manually.
_ACCEPT = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'
_COOKIE = '__mta=244176442.1622872454168.1622876903037.1622877097390.7; uuid_n_v=v1; uuid=6FFF6D30C5C211EB8D61CF53B1EFE83FE91D3C40EE5240DCBA0A422050B1E8C0; _csrf=bff9b813020b795594ff3b2ea3c1be6295b7453d19ecd72f8beb9700c679dfb4; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1622872443; _lxsdk_cuid=1770e9ed136c8-048c356e76a22b-7d677965-1fa400-1770e9ed136c8; _lxsdk=6FFF6D30C5C211EB8D61CF53B1EFE83FE91D3C40EE5240DCBA0A422050B1E8C0; ci=59; recentCis=59'

# Request headers shared by every request. The User-Agent is drawn once,
# when this dict is built, and then reused.
headers = {
    'Accept': _ACCEPT,
    'Cookie': _COOKIE,
    'User-Agent': str(ua.random),
}

def RequestsTools(url):
    """Fetch a page and parse it for xpath extraction.

    The request is sent with the module-level ``headers`` and the body is
    decoded as UTF-8 before being handed to ``etree.HTML``.

    :param url: request URL
    :return: parsed HTML object for xpath extraction
    :raises requests.exceptions.Timeout: if the server does not respond
        within the timeout
    """
    # Without an explicit timeout requests waits indefinitely, so a single
    # stalled connection would hang the whole crawl.
    response = requests.get(url, headers=headers, timeout=10)
    return etree.HTML(response.content.decode('utf-8'))

def Index(page):
    """Crawl one board page and hand every listed movie to ``Details``.

    :param page: offset value placed in the board URL's ``offset`` query
    """
    listing = RequestsTools(f'https://maoyan.com/board/4?offset={page}')

    hrefs = listing.xpath('//a[@class="image-link"]/@href')
    integer_parts = listing.xpath('//i[@class="integer"]/text()')
    fraction_parts = listing.xpath('//i[@class="fraction"]/text()')

    # The rating is split across two <i> elements; join them back together
    # and turn the relative link into an absolute URL.
    for href, whole, fraction in zip(hrefs, integer_parts, fraction_parts):
        time.sleep(2)  # pause between detail-page requests
        Details('https://maoyan.com' + href, whole + fraction)

def Details(url, pingfen):
    """Scrape one movie detail page and append a row to ``猫眼.csv``.

    The row is also echoed to stdout.

    :param url: detail-page URL
    :param pingfen: rating string, as assembled from the index page
    """
    html = RequestsTools(url)
    names = html.xpath('//h1[@class="name"]/text()')            # movie name
    genres = html.xpath('//li[@class="ellipsis"]/a/text()')     # genre
    # Second <li>: "region / duration" in a single text node
    region_info = html.xpath('/html/body/div[3]/div/div[2]/div[1]/ul/li[2]/text()')
    # Third <li>: release information
    release_info = html.xpath('/html/body/div[3]/div/div[2]/div[1]/ul/li[3]/text()')

    # zip pairs the lists positionally and stops at the shortest one, so a
    # page missing any of the fields produces no row.
    for name, genre, region_text, released in zip(names, genres, region_info, release_info):
        parts = region_text.replace('\n', '').split('/')
        countyr = parts[0]                                # region
        # Guard against text without a '/' instead of raising IndexError.
        shichang = parts[1] if len(parts) > 1 else ''     # duration
        with open('猫眼.csv', 'a', encoding='utf-8') as f:
            f.write(f"{name}, {pingfen}, {url}, {genre}, {countyr}, {shichang}, {released}\n")
        print(name, pingfen, url, genre, countyr, shichang, released)

# Walk the top-100 board in steps of 10: offsets 0, 10, ..., 90.
# (Multiplying range(0, 11) by 10 also requested offset 100, an extra page
# past the 100 entries.)
for page in range(0, 100, 10):
    Index(page)

Data Visualization Tools

import pandas as pd
import numpy as np
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# %matplotlib inline

Visualization Code

# Load CSV
# NOTE(review): the scraper shown earlier appends to '猫眼.csv' without a
# header row, while this script reads './maoyan.csv' and relies on named
# columns - confirm which file is intended.
path = './maoyan.csv'
df = pd.read_csv(path, sep=',', encoding='utf-8')
df.drop(df.columns[0], axis=1, inplace=True)   # discard the first column
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

# Movies released before 2018 and their per-year counts; computed once and
# reused by the year-based plots below.
before_2018 = df[df['上映时间'] < 2018]
year_counts = before_2018['上映时间'].value_counts().sort_index()

# Plot number of movies per release year (before 2018)
fig, ax = plt.subplots(figsize=(9, 6), dpi=70)
year_counts.plot(kind='line', ax=ax)
ax.set_xlabel('Year')
ax.set_ylabel('Number of Movies')
ax.set_title('Movies Released per Year')

# Relationship between year, count, and average rating
# Select the rating column before aggregating: calling mean() on the whole
# grouped frame tries to average the text columns too, which raises
# TypeError on pandas >= 2.0.
world_rating = before_2018.groupby('上映时间')['评分'].mean()
x = year_counts.index
y = year_counts.values
y2 = world_rating.values
fig, ax = plt.subplots(figsize=(10, 5), dpi=70)
ax.plot(x, y, label='Count')
ax.set_xlim(1980, 2017)
ax.set_xlabel('Year')
ax.set_ylabel('Count')
ax.set_title('Year, Count & Average Rating')
ax2 = ax.twinx()   # second y-axis so rating and count can share the x-axis
ax2.plot(x, y2, c='y', ls='--', label='Rating')
ax.legend(loc=1)
ax2.legend(loc=2)

# World average rating over years (only rows with a rating above 0)
fig, ax = plt.subplots(figsize=(10, 7), dpi=60)
(df[df['评分'] > 0]
 .groupby('上映时间')['评分']
 .mean()
 .plot(kind='line', ax=ax))
ax.set_ylabel('Rating')
ax.set_title('World Average Rating by Year')

# Genre distribution: one entry per comma-separated genre of every movie
types = [g for tp in df['类型'] for g in tp.split(',')]
genre_df = pd.DataFrame({'类型': types})
fig, ax = plt.subplots(figsize=(9, 6), dpi=60)
genre_df['类型'].value_counts().plot(kind='bar', ax=ax)
ax.set_xlabel('Genre')
ax.set_ylabel('Count')
ax.set_title('Genre Distribution')

# Duration vs Rating scatter plot
valid = df[df['评分'] > 0]
fig, ax = plt.subplots(figsize=(9, 6), dpi=70)
ax.scatter(valid['时长(min)'], valid['评分'], alpha=0.6, marker='o')
ax.set_xlabel('Duration (min)')
ax.set_ylabel('Rating')
ax.set_title('Duration vs Rating')

# China vs World comparison (1980‑2017)
# The China series is computed from china_df and plotted against its own
# years, which need not match the world's. Previously `y1` was undefined and
# the world series was drawn under the 'China' label.
china_df = df[df['地区'].str.contains('中国大陆')]
china_rating = (china_df[china_df['上映时间'] < 2018]
                .groupby('上映时间')['评分']
                .mean())
y1 = world_rating.values
fig, ax = plt.subplots(figsize=(12, 9), dpi=60)
ax.plot(world_rating.index, y1, ls='-', c='DarkTurquoise', label='World')
ax.plot(china_rating.index, china_rating.values, ls='--', c='Gold', label='China')
ax.set_title('China vs World Average Rating')
ax.set_xlabel('Year')
ax.set_xlim(1980, 2017)
ax.set_ylabel('Rating')
ax.legend()

# Word cloud of movie titles (first 15 rows)
wl = ",".join(df['电影'][:15].values)
wc = WordCloud(background_color='white', font_path='C:\\Windows\\Fonts\\simkai.ttf', max_font_size=60, random_state=30)
myword = wc.generate(wl)
wc.to_file('result.jpg')
plt.imshow(myword)
plt.axis('off')
plt.show()

Result Images

The generated plots illustrate release trends, genre counts, duration‑rating distributions, and a comparison between Chinese and global movie ratings.

Original Source

Signed-in readers can open the original source through BestHub's protected redirect.

Republication Notice

This article has been distilled and summarized from source material, then republished for learning and reference. If you believe it infringes your rights, please contact us and we will review it promptly.

Python Web Scraping Movie Data wordcloud

Written by

MaGe Linux Operations

Founded in 2009, MaGe Education is a top Chinese high‑end IT training brand. Its graduates earn 12K+ RMB salaries, and the school has trained tens of thousands of students. It offers high‑pay courses in Linux cloud operations, Python full‑stack, automation, data analysis, AI, and Go high‑concurrency architecture. Thanks to quality courses and a solid reputation, it has talent partnerships with numerous internet firms.

0 followers

Reader feedback

How this landed with the community

Rate this article

Was this worth your time?

Discussion

0 Comments

Thoughtful readers leave field notes, pushback, and hard-won operational detail here.