Scraping data from the Zhubajie website with XPath + requests

Disclaimer: the techniques shown here are shared for learning purposes only and have no other use. This site takes no responsibility for any damage done to the target website!

I have recently been learning how to write crawlers in Python and am still at the beginner stage. The basic approach is to fetch the page source with requests and then parse it with XPath. Purely as a learning exercise, this post fetches and parses listing information from Zhubajie and prints the results; no data is stored. These are simply my study notes.

Dependencies

pip install requests
pip install lxml

These two packages serve the following purposes:

  • requests: sends HTTP requests to the remote server and fetches the page source
  • lxml: parses the fetched HTML so the nodes we need can be located quickly with XPath syntax (see the minimal sketch after this list)
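Before the full script, here is a minimal sketch of how the two packages fit together: requests fetches the page, and lxml parses it so XPath can pull out the nodes we want. The URL and XPath expressions are placeholders for illustration only, not part of the Zhubajie scraper itself.

# Minimal sketch: requests fetches the HTML, lxml + XPath extracts nodes.
# The URL and XPath expressions below are placeholders, not the real target.
import requests
from lxml import etree

resp = requests.get("https://example.com", timeout=10)
resp.raise_for_status()               # stop early on HTTP errors

tree = etree.HTML(resp.text)          # build an element tree from the HTML
headings = tree.xpath("//h1/text()")  # text of every <h1> node
links = tree.xpath("//a/@href")       # href attribute of every <a> node
print(headings, links)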

Code Implementation

# Fetch and parse service listings from the Zhubajie website
import requests
from lxml import etree

# check_resp is a small helper from a separate local module that validates the response
from baidu_fanyi_test import check_resp

# Request headers that mimic a regular browser visit
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "authority": "www.zbj.com",
    "method": "GET",
}


def search_key(key):
    # Build the search URL for the keyword and return the raw HTML of the result page
    if key is None or key == '':
        print("请输入搜索关键字")  # "please enter a search keyword"
        return None
    req_url = f"https://www.zbj.com/fw/?k={key}"
    r = requests.get(req_url, headers=headers)
    check_resp(r)
    return r.text


def get_search_data(html):
    # Parse the search result page and return a list of parsed listing dicts
    if html is None or html == '':
        print("html为空")  # "html is empty"
        exit(1)

    html = etree.HTML(html)
    # Each service listing sits in a div with class 'service-card-wrap'
    content_list = html.xpath("//div[@class='service-card-wrap']")
    if len(content_list) == 0:
        print("没有匹配到对应的数据")  # "no matching data found"
        exit(2)

    search_infos = []
    for content in content_list:
        info = parse_info(content)
        if info is not None:
            search_infos.append(info)

    return search_infos


def parse_info(content):
    # Extract title, price, sales, rating and shop name from one listing card
    bot_content = content.xpath(".//div[@class='bot-content']")
    if len(bot_content) == 0:
        return None
    bot_content = bot_content[0]
    price = bot_content.xpath(".//div[@class='price']/span/text()")
    title = bot_content.xpath(".//div[@class='name-pic-box']/a/span/text()")
    # The highlighted search keyword is wrapped in <h1> tags, so its text nodes
    # come back separately and have to be interleaved back into the title
    key_word = bot_content.xpath(".//div[@class='name-pic-box']/a/span/h1/text()")

    final_title = []
    key_word_len = len(key_word)

    for i, v in enumerate(title):
        final_title.append(v)
        if key_word_len > 0:
            if i >= key_word_len:
                continue
            final_title.append(key_word[i])

    sales = bot_content.xpath(".//div[@class='descprit-box']/div[@class='sales']//span[@class='num']/text()")
    good_num = bot_content.xpath(".//div[@class='descprit-box']/div[@class='evaluate']/span[@class='num']/text()")

    shop_detail = content.xpath(".//div[@class='shop-detail']")[0]
    score = shop_detail.xpath(".//span[@class='shop-score']/text()")
    com_name = shop_detail.xpath(".//div[contains(@class, 'shop-info')]/text()")

    # Fall back to a placeholder string when a field is missing from the card
    return {
        "title": "".join(final_title),
        "price": price[0],
        "sales": sales[0] if sales else "暂无销量",
        "score": score[0] if score else "暂无评分",
        "com_name": com_name[0] if com_name else "暂无商家",
        "good_num": good_num[0] if good_num else "暂无好评"
    }


if __name__ == '__main__':
    key = "ERP"
    # Search for the keyword and print each parsed listing
    for info in get_search_data(search_key(key)):
        print(info)

The code above searches Zhubajie for the keyword ERP, retrieves the names and reputation details of service providers that can build ERP software, and prints the corresponding data. Again, this code is for learning purposes only!
