Python 使用 XPath 获取豆瓣电影排行榜数据

安装依赖库

Plain text
Copy to clipboard
Open code in new window
EnlighterJS 3 Syntax Highlighter
python -m pip install requests
python -m pip install lxml
python -m pip install requests lxml
python -m pip install requests
python -m pip install lxml

实现代码

Plain text
Copy to clipboard
Open code in new window
EnlighterJS 3 Syntax Highlighter
# 豆瓣排行榜数据抓取
import re
import requests
from lxml import etree
from baidu_fanyi_test import check_resp
# Address of the chart page that the script downloads.
req_url = "https://movie.douban.com/chart"

# Headers sent with the download request; only a desktop-browser
# User-Agent is set.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/84.0"
    ),
}
def get_html(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The decoded response body (``Response.text``).

    Raises:
        requests.exceptions.RequestException: If the connection fails or
            the timeout expires.
    """
    r = requests.get(url, headers=headers, timeout=timeout)
    # check_resp is imported from the local baidu_fanyi_test module;
    # presumably it validates the response -- confirm against that module.
    check_resp(r)
    return r.text
def get_chart(html):
    """Extract the movie entries from the chart page markup.

    Args:
        html: Markup of the chart page as a string.

    Returns:
        A list of dicts, one per table that contains a ``div.pl2`` block,
        with the keys ``title``, ``detail``, ``score`` and ``people``.
        ``detail`` is ``""`` when the description paragraph is missing;
        ``score`` and ``people`` are ``None`` when the rating block has
        fewer spans than expected. Tables without a title link are skipped.
    """
    if not html:
        # Nothing to parse.
        return []

    ele = etree.HTML(html)
    if ele is None:
        # Defensive: bail out if the parser produced no root element.
        return []

    tables = ele.xpath("//div[@class='indent']/div/table")
    movies = []
    for table in tables:
        movie_infos = table.xpath(".//tr/td[@valign='top']/div[@class='pl2']")
        # xpath() returns a (possibly empty) list, never None, so test for
        # emptiness; otherwise the [0] below raises IndexError.
        if not movie_infos:
            continue
        movie_info = movie_infos[0]

        link_texts = movie_info.xpath(".//a/text()")
        if not link_texts:
            # No title text at all: there is nothing useful to report.
            continue
        span_texts = movie_info.xpath(".//span/text()")
        title = link_texts[0] + (span_texts[0] if span_texts else "")
        # Collapse line breaks and runs of whitespace into single spaces.
        title = re.sub(r'[\n\s]+', ' ', title)

        details = movie_info.xpath(".//p[@class='pl']/text()")
        detail = details[0] if details else ""

        # The second and third spans of the "star" block hold the values
        # reported as score and people; either may be absent.
        spans = movie_info.xpath(".//div[contains(@class, 'star')]/span")
        score = spans[1].text if len(spans) > 1 else None
        people = spans[2].text if len(spans) > 2 else None

        movies.append({
            "title": title,
            "detail": detail,
            "score": score,
            "people": people,
        })

    return movies
if __name__ == '__main__':
    # Download the chart page, parse it and print a summary per movie.
    movies = get_chart(get_html(req_url))
    for movie in movies:
        print(f"----------{movie['title']}------------")
        print(f"电影名称: {movie['title']}")
        print(f"电影内容: {movie['detail']}")
        print(f"电影评分: {movie['score']}/{movie['people']}")
# Scrape the Douban movie chart.
import re

import requests
from lxml import etree

from baidu_fanyi_test import check_resp

# Address of the chart page that the script downloads.
req_url = "https://movie.douban.com/chart"
# Headers sent with the download request; only a User-Agent is set.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0"
}


def get_html(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The decoded response body (``Response.text``).
    """
    r = requests.get(url, headers=headers, timeout=timeout)
    # check_resp is imported from the local baidu_fanyi_test module;
    # presumably it validates the response -- confirm against that module.
    check_resp(r)
    return r.text


def get_chart(html):
    """Extract the movie entries from the chart page markup.

    Args:
        html: Markup of the chart page as a string.

    Returns:
        A list of dicts with the keys ``title``, ``detail``, ``score`` and
        ``people``. ``detail`` is ``""`` and ``score``/``people`` are
        ``None`` when the corresponding nodes are missing. Tables without
        a title link are skipped.
    """
    if not html:
        # Nothing to parse.
        return []

    ele = etree.HTML(html)
    if ele is None:
        # Defensive: bail out if the parser produced no root element.
        return []

    tables = ele.xpath("//div[@class='indent']/div/table")
    movies = []
    for table in tables:
        movie_infos = table.xpath(".//tr/td[@valign='top']/div[@class='pl2']")
        # xpath() returns a (possibly empty) list, never None.
        if not movie_infos:
            continue
        movie_info = movie_infos[0]

        link_texts = movie_info.xpath(".//a/text()")
        if not link_texts:
            continue
        span_texts = movie_info.xpath(".//span/text()")
        title = link_texts[0] + (span_texts[0] if span_texts else "")
        # Collapse line breaks and runs of whitespace into single spaces.
        title = re.sub(r'[\n\s]+', ' ', title)

        details = movie_info.xpath(".//p[@class='pl']/text()")
        detail = details[0] if details else ""

        spans = movie_info.xpath(".//div[contains(@class, 'star')]/span")
        score = spans[1].text if len(spans) > 1 else None
        people = spans[2].text if len(spans) > 2 else None

        movies.append({
            "title": title,
            "detail": detail,
            "score": score,
            "people": people,
        })

    return movies


if __name__ == '__main__':
    movies = get_chart(get_html(req_url))
    for movie in movies:
        print(f"----------{movie['title']}------------")
        print(f"电影名称: {movie['title']}")
        print(f"电影内容: {movie['detail']}")
        print(f"电影评分: {movie['score']}/{movie['people']}")
# 豆瓣排行榜数据抓取
import re

import requests
from lxml import etree

from baidu_fanyi_test import check_resp

# Address of the chart page that the script downloads.
req_url = "https://movie.douban.com/chart"

# Headers sent with the download request; only a desktop-browser
# User-Agent is set.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/84.0"
    ),
}


def get_html(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The decoded response body (``Response.text``).

    Raises:
        requests.exceptions.RequestException: If the connection fails or
            the timeout expires.
    """
    r = requests.get(url, headers=headers, timeout=timeout)
    # check_resp is imported from the local baidu_fanyi_test module;
    # presumably it validates the response -- confirm against that module.
    check_resp(r)
    return r.text


def get_chart(html):
    """Extract the movie entries from the chart page markup.

    Args:
        html: Markup of the chart page as a string.

    Returns:
        A list of dicts, one per table that contains a ``div.pl2`` block,
        with the keys ``title``, ``detail``, ``score`` and ``people``.
        ``detail`` is ``""`` when the description paragraph is missing;
        ``score`` and ``people`` are ``None`` when the rating block has
        fewer spans than expected. Tables without a title link are skipped.
    """
    if not html:
        # Nothing to parse.
        return []

    ele = etree.HTML(html)
    if ele is None:
        # Defensive: bail out if the parser produced no root element.
        return []

    tables = ele.xpath("//div[@class='indent']/div/table")
    movies = []
    for table in tables:
        movie_infos = table.xpath(".//tr/td[@valign='top']/div[@class='pl2']")
        # xpath() returns a (possibly empty) list, never None, so test for
        # emptiness; otherwise the [0] below raises IndexError.
        if not movie_infos:
            continue
        movie_info = movie_infos[0]

        link_texts = movie_info.xpath(".//a/text()")
        if not link_texts:
            # No title text at all: there is nothing useful to report.
            continue
        span_texts = movie_info.xpath(".//span/text()")
        title = link_texts[0] + (span_texts[0] if span_texts else "")
        # Collapse line breaks and runs of whitespace into single spaces.
        title = re.sub(r'[\n\s]+', ' ', title)

        details = movie_info.xpath(".//p[@class='pl']/text()")
        detail = details[0] if details else ""

        # The second and third spans of the "star" block hold the values
        # reported as score and people; either may be absent.
        spans = movie_info.xpath(".//div[contains(@class, 'star')]/span")
        score = spans[1].text if len(spans) > 1 else None
        people = spans[2].text if len(spans) > 2 else None

        movies.append({
            "title": title,
            "detail": detail,
            "score": score,
            "people": people,
        })

    return movies


if __name__ == '__main__':
    # Download the chart page first, then parse it and print a short
    # summary for every movie found.
    page = get_html(req_url)
    movies = get_chart(page)
    for movie in movies:
        name = movie['title']
        print(f"----------{name}------------")
        print(f"电影名称: {name}")
        print(f"电影内容: {movie['detail']}")
        print(f"电影评分: {movie['score']}/{movie['people']}")

 

以上就是获取豆瓣电影排行榜数据的代码。这里只实现了最基本的抓取,没有进一步爬取更多的数据和电影的详情,大家可以自行扩展。注意:代码中的 check_resp 来自本地模块 baidu_fanyi_test,不在上面安装的依赖之内,运行前需确保该模块可以被导入。

Leave a Comment

Comments

No comments yet. Why don’t you start the discussion?

发表回复

您的邮箱地址不会被公开。 必填项已用 * 标注