安装依赖库
python -m pip install requests
python -m pip install lxml
python -m pip install requests
python -m pip install lxml
python -m pip install requests lxml
实现代码
# 豆瓣排行榜数据抓取
import re
import requests
from lxml import etree
from baidu_fanyi_test import check_resp
# Address of the Douban movie chart page that the script downloads.
req_url = "https://movie.douban.com/chart"

# Request headers handed to requests.get() by get_html(); they carry a
# desktop Chrome User-Agent string.
# NOTE(review): presumably required so Douban serves the page to a script
# client -- confirm.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0",
}
def get_html(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives
            up. Without a timeout ``requests.get`` can block indefinitely
            on an unresponsive host.

    Returns:
        The decoded response body (``Response.text``).

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout expires.
    """
    r = requests.get(url, headers=headers, timeout=timeout)
    # check_resp is imported from the project-local baidu_fanyi_test module.
    # NOTE(review): presumably it validates the HTTP status -- confirm.
    check_resp(r)
    return r.text
def get_chart(html):
    """Extract the movie entries from the Douban chart page markup.

    Every ``<table>`` under ``div.indent`` is treated as one movie entry.
    Entries without a ``div.pl2`` info block or without a title link are
    skipped instead of raising ``IndexError``.

    Args:
        html: Markup of the chart page, as returned by ``get_html``.

    Returns:
        A list of dicts with the keys ``title``, ``detail``, ``score`` and
        ``people``. ``detail`` is ``""`` when the description paragraph is
        missing; ``score`` and ``people`` are ``None`` when the rating block
        has too few ``<span>`` children.
    """
    if not html:
        return []
    ele = etree.HTML(html)
    if ele is None:
        # Nothing could be parsed out of the markup.
        return []

    movies = []
    for table in ele.xpath("//div[@class='indent']/div/table"):
        movie_infos = table.xpath(".//tr/td[@valign='top']/div[@class='pl2']")
        # xpath() returns a list (possibly empty), never None, so test for
        # emptiness before indexing.
        if not movie_infos:
            continue
        movie_info = movie_infos[0]

        title_parts = movie_info.xpath(".//a/text()")
        if not title_parts:
            continue
        # The first <span> text is appended to the link text.
        # NOTE(review): presumably this is the alternative title -- confirm.
        alias_parts = movie_info.xpath(".//span/text()")
        title = title_parts[0] + (alias_parts[0] if alias_parts else "")
        # Collapse line breaks and runs of whitespace into single spaces.
        title = re.sub(r'[\n\s]+', ' ', title)

        detail_parts = movie_info.xpath(".//p[@class='pl']/text()")
        detail = detail_parts[0] if detail_parts else ""

        # Within the rating block the second span holds the score and the
        # third the number of raters; either may be missing.
        spans = movie_info.xpath(".//div[contains(@class, 'star')]/span")
        score = spans[1].text if len(spans) > 1 else None
        people = spans[2].text if len(spans) > 2 else None

        movies.append({
            "title": title,
            "detail": detail,
            "score": score,
            "people": people,
        })
    return movies
if __name__ == '__main__':
    # Script entry point: download the chart page, parse it, then print a
    # short report (title banner, title, description, score/raters) for
    # every movie that was found.
    page_html = get_html(req_url)
    movies = get_chart(page_html)
    for movie in movies:
        print(f"----------{movie['title']}------------")
        print(f"电影名称: {movie['title']}")
        print(f"电影内容: {movie['detail']}")
        print(f"电影评分: {movie['score']}/{movie['people']}")
# 豆瓣排行榜数据抓取
import re
import requests
from lxml import etree
from baidu_fanyi_test import check_resp
# Address of the Douban movie chart page that the script downloads.
req_url = "https://movie.douban.com/chart"

# Request headers handed to requests.get() by get_html(); they carry a
# desktop Chrome User-Agent string.
# NOTE(review): presumably required so Douban serves the page to a script
# client -- confirm.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0",
}
def get_html(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives
            up. Without a timeout ``requests.get`` can block indefinitely
            on an unresponsive host.

    Returns:
        The decoded response body (``Response.text``).

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout expires.
    """
    r = requests.get(url, headers=headers, timeout=timeout)
    # check_resp is imported from the project-local baidu_fanyi_test module.
    # NOTE(review): presumably it validates the HTTP status -- confirm.
    check_resp(r)
    return r.text
def get_chart(html):
    """Extract the movie entries from the Douban chart page markup.

    Every ``<table>`` under ``div.indent`` is treated as one movie entry.
    Entries without a ``div.pl2`` info block or without a title link are
    skipped instead of raising ``IndexError``.

    Args:
        html: Markup of the chart page, as returned by ``get_html``.

    Returns:
        A list of dicts with the keys ``title``, ``detail``, ``score`` and
        ``people``. ``detail`` is ``""`` when the description paragraph is
        missing; ``score`` and ``people`` are ``None`` when the rating block
        has too few ``<span>`` children.
    """
    if not html:
        return []
    ele = etree.HTML(html)
    if ele is None:
        # Nothing could be parsed out of the markup.
        return []

    movies = []
    for table in ele.xpath("//div[@class='indent']/div/table"):
        movie_infos = table.xpath(".//tr/td[@valign='top']/div[@class='pl2']")
        # xpath() returns a list (possibly empty), never None, so test for
        # emptiness before indexing.
        if not movie_infos:
            continue
        movie_info = movie_infos[0]

        title_parts = movie_info.xpath(".//a/text()")
        if not title_parts:
            continue
        # The first <span> text is appended to the link text.
        # NOTE(review): presumably this is the alternative title -- confirm.
        alias_parts = movie_info.xpath(".//span/text()")
        title = title_parts[0] + (alias_parts[0] if alias_parts else "")
        # Collapse line breaks and runs of whitespace into single spaces.
        title = re.sub(r'[\n\s]+', ' ', title)

        detail_parts = movie_info.xpath(".//p[@class='pl']/text()")
        detail = detail_parts[0] if detail_parts else ""

        # Within the rating block the second span holds the score and the
        # third the number of raters; either may be missing.
        spans = movie_info.xpath(".//div[contains(@class, 'star')]/span")
        score = spans[1].text if len(spans) > 1 else None
        people = spans[2].text if len(spans) > 2 else None

        movies.append({
            "title": title,
            "detail": detail,
            "score": score,
            "people": people,
        })
    return movies
if __name__ == '__main__':
    # Script entry point: download the chart page, parse it, then print a
    # short report (title banner, title, description, score/raters) for
    # every movie that was found.
    page_html = get_html(req_url)
    movies = get_chart(page_html)
    for movie in movies:
        print(f"----------{movie['title']}------------")
        print(f"电影名称: {movie['title']}")
        print(f"电影内容: {movie['detail']}")
        print(f"电影评分: {movie['score']}/{movie['people']}")
# Scrape the Douban movie chart.
import re

import requests
from lxml import etree

from baidu_fanyi_test import check_resp

# Address of the Douban movie chart page that the script downloads.
req_url = "https://movie.douban.com/chart"

# Request headers passed to requests.get(); they carry a desktop Chrome
# User-Agent string.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0"
}


def get_html(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives
            up, so an unresponsive host cannot block the script forever.

    Returns:
        The decoded response body (``Response.text``).
    """
    r = requests.get(url, headers=headers, timeout=timeout)
    # check_resp is a project-local helper.
    # NOTE(review): presumably it validates the HTTP status -- confirm.
    check_resp(r)
    return r.text


def get_chart(html):
    """Extract the movie entries from the Douban chart page markup.

    Entries without a ``div.pl2`` info block or without a title link are
    skipped instead of raising ``IndexError``.

    Args:
        html: Markup of the chart page, as returned by ``get_html``.

    Returns:
        A list of dicts with the keys ``title``, ``detail``, ``score`` and
        ``people``. ``detail`` is ``""`` when the description is missing;
        ``score`` and ``people`` are ``None`` when the rating block has too
        few ``<span>`` children.
    """
    if not html:
        return []
    ele = etree.HTML(html)
    if ele is None:
        return []

    movies = []
    for table in ele.xpath("//div[@class='indent']/div/table"):
        movie_infos = table.xpath(".//tr/td[@valign='top']/div[@class='pl2']")
        # xpath() returns a list (possibly empty), never None.
        if not movie_infos:
            continue
        movie_info = movie_infos[0]

        title_parts = movie_info.xpath(".//a/text()")
        if not title_parts:
            continue
        alias_parts = movie_info.xpath(".//span/text()")
        title = title_parts[0] + (alias_parts[0] if alias_parts else "")
        # Collapse line breaks and runs of whitespace into single spaces.
        title = re.sub(r'[\n\s]+', ' ', title)

        detail_parts = movie_info.xpath(".//p[@class='pl']/text()")
        detail = detail_parts[0] if detail_parts else ""

        # Second span holds the score, third the number of raters.
        spans = movie_info.xpath(".//div[contains(@class, 'star')]/span")
        score = spans[1].text if len(spans) > 1 else None
        people = spans[2].text if len(spans) > 2 else None

        movies.append({
            "title": title,
            "detail": detail,
            "score": score,
            "people": people,
        })
    return movies


if __name__ == '__main__':
    movies = get_chart(get_html(req_url))
    for movie in movies:
        print(f"----------{movie['title']}------------")
        print(f"电影名称: {movie['title']}")
        print(f"电影内容: {movie['detail']}")
        print(f"电影评分: {movie['score']}/{movie['people']}")
以上就是获取豆瓣电影排行数据的代码。这里只实现了最基本的抓取，没有爬取更多的数据和电影详情，大家可以自行扩展。