Crack Custom Font Anti‑Scraping on Dianping: A Complete Python Guide
This article walks through decoding custom font anti‑scraping on a Chinese group‑buying site using Python, covering page fetching, CSS font URL extraction, .woff download, glyph mapping with fontTools, OCR via ddddocr, DOM manipulation with BeautifulSoup, and exporting the cleaned data to Excel.
Deep Dive into Custom Font Parsing
First, understand the difference between custom fonts and regular fonts: a custom font maps special Unicode code points to glyphs that draw ordinary characters, so the text in the page source is unreadable without the font, while regular fonts render standard characters directly.
Python Loading the Page
import requests
# Browser-like request headers sent with every request made through the session.
headers = {
    "Connection": "keep-alive",
    "Cache-Control": "max-age=0",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "Accept-Language": "zh-CN,zh;q=0.9",
}
session = requests.Session()
# Merge into the session's existing headers instead of replacing its
# case-insensitive header mapping with a plain dict.
session.headers.update(headers)
res = session.get("http://www.dianping.com/shenzhen/ch30")

# Parse the downloaded page with BeautifulSoup:
from bs4 import BeautifulSoup

soup = BeautifulSoup(res.text, 'html5lib')

# Extracting CSS URL for Custom Fonts
The CSS file that defines custom fonts contains the keyword svgtextcss in its URL.
from urllib import parse
def getUrlFromNode(nodes, tag):
    """Return the absolute URL of the first node whose ``href`` contains ``tag``.

    Args:
        nodes: Iterable of link-like nodes that support ``node.get("href")``.
        tag: Substring to look for inside each ``href`` value.

    Returns:
        The matching ``href`` joined onto the module-level ``base_url``, or
        ``None`` when no node matches.
    """
    for node in nodes:
        # A node without an href is skipped rather than raising KeyError.
        url = node.get("href")
        if url and tag in url:
            return parse.urljoin(base_url, url)
    return None
def get_css_url(soup):
    """Return the URL of the stylesheet whose href contains ``svgtextcss``.

    Args:
        soup: Parsed page; its ``head > link[rel=stylesheet]`` nodes are searched.

    Returns:
        Whatever ``getUrlFromNode`` returns for those nodes.
    """
    stylesheet_links = soup.select("head > link[rel=stylesheet]")
    return getUrlFromNode(stylesheet_links, "svgtextcss")


# Parsing CSS to Get Font URLs
def parseCssFontUrl(css_url, tag=None, only_First=True):
    """Map each CSS class name to the URL of the custom font it uses.

    The stylesheet is fetched through the module-level ``session`` and split
    into ``selector { declarations }`` blocks. ``@font-face`` blocks supply
    the font-family -> source URL lists; every other block that declares a
    ``font-family`` supplies a class name -> font-family entry.

    Args:
        css_url: URL of the stylesheet to download.
        tag: Passed through to ``get_url`` for each font's URL list.
            NOTE(review): ``get_url`` is not defined in the visible code;
            presumably it picks the URL matching ``tag`` - confirm.
        only_First: Not used by this function; kept so existing callers
            continue to work.

    Returns:
        Dict of class name (selector without its first character) to the
        value ``get_url`` returns for that class's font.
    """
    import re  # local import: no module-level ``import re`` is visible here

    css_text = session.get(css_url).text
    rule = {}
    font_face = {}
    for name, value in re.findall(r"([^{}]+)\{([^{}]+)\}", css_text):
        name = name.strip()
        family = None
        urls = []
        for row in value.split(";"):
            # Split on the first colon only, so values that themselves
            # contain ':' (e.g. absolute URLs) do not break unpacking.
            k, sep, v = row.partition(":")
            if not sep:
                continue
            k, v = k.strip(), v.strip(' "\'')
            if k == "font-family":
                family = v
            elif k == "src":
                urls.extend(re.findall(r'url\("([^()]+)"\)', v))
        if family is None:
            continue
        if name == "@font-face":
            # Attach URLs after the whole block is read, so the order of
            # ``src`` and ``font-family`` inside the block does not matter.
            font_face.setdefault(family, []).extend(urls)
        else:
            rule[name[1:]] = family
    font_urls = {}
    for class_name, family in rule.items():
        # Classes whose font-family has no @font-face block have no
        # downloadable font, so they are left out.
        if family in font_face:
            font_urls[class_name] = get_url(font_face[family], tag)
    return font_urls


# Downloading the .woff Fonts
def download_file(url, out_name=None):
    """Download ``url`` through the module-level ``session`` and save it to disk.

    Args:
        url: Address of the file to fetch.
        out_name: Destination path; defaults to the text after the last
            ``/`` in ``url``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    if out_name is None:
        out_name = url[url.rfind("/") + 1:]
    # Fetch and validate before opening the output file, so a failed request
    # does not leave an empty or error-page file behind.
    response = session.get(url)
    response.raise_for_status()
    with open(out_name, "wb") as f:
        f.write(response.content)
# NOTE(review): ``font_urls`` is not assigned at module level in the visible
# code; presumably it is the dict returned by parseCssFontUrl - confirm.
for class_name, url in font_urls.items():
    download_file(url, f"{class_name}.woff")

# After downloading, inspect the fonts with FontCreator (or any font viewer) to confirm that the glyph order is consistent across files.
Building the Glyph‑to‑Character Mapping
Use fontTools to read the glyph order, then map each glyph to its real character using OCR on rendered glyph images.
from fontTools.ttLib import TTFont
from PIL import ImageFont, Image, ImageDraw
words = '1234567890店中美家馆小车大市公酒行国品发电金心业商司超生装园场食有新限天面工服海华水房饰城乐汽香部利子老艺花专东肉菜学福饭人百餐茶务通味所山区门药银农龙停尚安广鑫一容动南具源兴鲜记时机烤文康信果阳理锅宝达地儿衣特产西批坊州牛佳化五米修爱北养卖建材三会鸡室红站德王光名丽油院堂烧江社合星货型村自科快便日民营和活童明器烟育宾精屋经居庄石顺林尔县手厅销用好客火雅盛体旅之鞋辣作粉包楼校鱼平彩上吧保永万物教吃设医正造丰健点汤网庆技斯洗料配汇木缘加麻联卫川泰色世方寓风幼羊烫来高厂兰阿皮全女拉成云维贸道术运都口博河瑞宏京际路祥青镇厨培力惠连马鸿钢训影甲助窗布富牌头四多妆吉苑沙恒隆春干饼氏里二管诚制售嘉长轩杂副清计黄讯太鸭号街交与叉附近层旁对巷栋环省桥湖段乡厦铺内侧元购前幢滨处向座下凤港开关景泉塘放昌线湾政步宁解白田町溪十八古双胜本单同九迎台玉锦底后七斜期武岭松角纪朝峰六振珠局岗洲横边济井办汉代临弄塔杨铁浦字年岛陵原梅进荣友虹央桂沿事津凯莲丁秀柳集紫旗张谷的是不了很还个也这我就在以可到错没去过感次要比觉看得说常真们但最喜哈么别位能较境非为欢然他挺着价那意种想出员两推做排实分间甜度起满给热完格荐喝等其再几只朋候样直而买于般豆量选奶打每评少算因情找份适蛋师气你姐棒试总定啊足级整带虾如态且尝主话强当更板知己无酸让入啦式笑赞片酱差像提队走嫩才刚午接重串回晚微周值费性桌拍块调糕'
tfont = TTFont("tagName.woff")
# The first two glyph-order entries are skipped; the code below assumes the
# remaining names have the form ``uni`` + hexadecimal code point.
uni_list = tfont.getGlyphOrder()[2:]
font = ImageFont.truetype("tagName.woff", 20)
imgs = []
# Render the glyphs 25 per image, one row of text per image.
for i in range(0, len(uni_list), 25):
    im = Image.new(mode='RGB', size=(20*25+10, 22), color="white")
    draw = ImageDraw.Draw(im)
    # Turn each glyph name into the character at that code point.
    unknown_chars = "".join(chr(int(name[3:], 16)) for name in uni_list[i:i+25])
    draw.text(xy=(5, -4), text=unknown_chars, fill=0, font=font)
    imgs.append(im)

# Run OCR on each image using ddddocr (a lightweight OCR library that supports Chinese):
from ddddocr import DdddOcr
from io import BytesIO
def get_img_bytes(img):
    """Return ``img`` encoded as JPEG bytes.

    Args:
        img: Object with a ``save(file, format=...)`` method; it is asked to
            write itself in ``'JPEG'`` format to an in-memory buffer.

    Returns:
        The bytes written to the buffer.
    """
    buf = BytesIO()
    img.save(buf, format='JPEG')
    return buf.getvalue()
ocr = DdddOcr()
# One recognized string per rendered image, in the same order as ``imgs``.
result = []
for im in imgs:
    text = ocr.classification(get_img_bytes(im))
    result.append(text)

# The OCR output yields a high‑accuracy character set that can be used to build a mapping from glyph code to real text.
# Key: glyph name without its first three characters; value: the character
# at the same position in ``words``.
font_map = dict(zip(map(lambda x: x[3:], uni_list), words))

# Replacing Custom Glyphs in the DOM
def fix_text(soup):
    """Replace every ``<svgmtsi>`` node in ``soup`` with its decoded text, in place.

    For each node, the first CSS class selects a font map (obtained from
    ``getFontMapFromClassName``), and each character of the node's text is
    looked up in that map by its escaped code.

    Args:
        soup: Parsed page to modify.
    """
    css_url = get_css_url(soup)
    font_maps = {}  # class name -> font map, resolved once per call
    for svgmtsi in soup.find_all('svgmtsi'):
        classes = svgmtsi.get('class')
        if not classes:
            # Without a class there is no way to pick a font; leave the node.
            continue
        class_name = classes[0]
        if class_name not in font_maps:
            font_maps[class_name] = getFontMapFromClassName(class_name, css_url)
        font_map = font_maps[class_name]
        chars = []
        for c in svgmtsi.text:
            code = c.encode("unicode_escape").decode()[2:]
            # Characters the font map does not cover are kept unchanged
            # instead of raising KeyError.
            chars.append(font_map.get(code, c))
        svgmtsi.replace_with("".join(chars))


# Extracting Structured Data
import re

# Matches a run of digits; used to pull the number out of the star CSS class.
num_rule = re.compile(r"\d+")


def _text_or_none(node, selector):
    """Return the text of the first match for ``selector`` under ``node``, or None."""
    found = node.select_one(selector)
    return found.text if found else None


def parse_data(soup):
    """Extract one record per shop entry from a listing page.

    Args:
        soup: Parsed listing page.

    Returns:
        List of ``(title, star, comment_num, mean_price, fun_type, area, url)``
        tuples. Any field whose element is missing from the page is ``None``.
        ``star`` is the first number in the star element's class list,
        integer-divided by 10.
    """
    result = []
    for li in soup.select("div#shop-all-list div.txt"):
        title = _text_or_none(li, "div.tit>a>h4")
        link = li.select_one("div.tit>a")
        url = link.get('href') if link else None
        star_tag = li.select_one("div.comment>div.nebula_star>div.star_icon>span")
        star_class = star_tag.get('class', []) if star_tag else []
        digits = num_rule.findall(" ".join(star_class))
        star = int(digits[0]) // 10 if digits else None
        comment_num = _text_or_none(li, "div.comment>a.review-num>b")
        mean_price = _text_or_none(li, "div.comment>a.mean-price>b")
        fun_type = _text_or_none(li, "div.tag-addr>a:nth-of-type(1)>span.tag")
        area = _text_or_none(li, "div.tag-addr>a:nth-of-type(2)>span.tag")
        result.append((title, star, comment_num, mean_price, fun_type, area, url))
    return result


# Batch Crawling and Export
import random
import time

headers = {"User-Agent": "Mozilla/5.0 ..."}
session = requests.Session()
# Merge into the session's existing headers instead of replacing its
# case-insensitive header mapping with a plain dict.
session.headers.update(headers)
base_url = "http://www.dianping.com/shenzhen/ch30"
res = session.get(base_url)
soup = BeautifulSoup(res.text, 'html5lib')
# Get category links
# NOTE(review): the 'r91172' suffix is presumably a region filter - confirm.
type_list = [
    (a.span.text, a['href'] + 'r91172')
    for a in soup.select("div#classfy>a")
]
all_results = []
for name, url in type_list:
    page = session.get(url)
    page_soup = BeautifulSoup(page.text, 'html5lib')
    fix_text(page_soup)
    all_results.extend(parse_data(page_soup))
    # Pause 2-4 seconds between category pages.
    time.sleep(random.randint(2,4))
import pandas as pd
df = pd.DataFrame(all_results, columns=["标题","星级","评论数","均价","娱乐类型","区域","链接"])
df.to_excel("华南城娱乐.xlsx", index=False)

# Overall, the site’s anti‑scraping measures are strong; the tutorial demonstrates how to defeat the hardest part—custom font obfuscation—so that the remaining data can be scraped smoothly.
Signed-in readers can open the original source through BestHub's protected redirect.
This article has been distilled and summarized from source material, then republished for learning and reference. If you believe it infringes your rights, please contact us and we will review it promptly.
Python Crawling & Data Mining
Life's short, I code in Python. This channel shares Python web crawling, data mining, analysis, processing, visualization, automated testing, DevOps, big data, AI, cloud computing, machine learning tools, resources, news, technical articles, tutorial videos and learning materials. Join us!
How this landed with the community
Was this worth your time?
0 Comments
Thoughtful readers leave field notes, pushback, and hard-won operational detail here.
