今天的案例爬取大麦网搜索页:https://search.damai.cn/search.htm
1.使用分析方法
import requests
from lxml import etree  # NOTE(review): unused in this script; kept since the file may use it elsewhere
import csv
import os

# Scrape the Damai search Ajax endpoint page by page and append the
# results to damai.csv.
base_url = 'https://search.damai.cn/searchajax.html'
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"
}
data = []
page = input('请输入爬取页数:')
# currPage is presumably 1-based on the Damai API (TODO confirm);
# range(int(page)) would have requested a non-existent page 0.
for p in range(1, int(page) + 1):
    params = (
        ("keyword", ""),
        ("cty", ""),
        ("ctl", ""),
        ("sctl", ""),
        ("tsg", "0"),
        ("st", ""),
        ("et", ""),
        ("order", "1"),
        ("pageSize", "30"),
        ("currPage", p),
        ("tn", ""),
    )
    r = requests.get(url=base_url, headers=headers, params=params).json()
    action_list = r["pageData"]["resultData"]
    # Keep only the fields we persist to CSV.
    for item in action_list:
        data.append({
            'categoryname': item['categoryname'],
            'name': item['name'],
            'price_str': item['price_str'],
            'showtime': item['showtime'],
            'venue': item['venue'],
        })

# Write the header row only when the file does not exist yet, so repeated
# runs in append mode do not produce duplicate headers.
write_header = not os.path.exists('damai.csv')
with open('damai.csv', 'a', encoding='utf-8', newline='') as f:
    writer = csv.DictWriter(
        f, fieldnames=['categoryname', 'name', 'price_str', 'showtime', 'venue'])
    if write_header:
        writer.writeheader()
    writer.writerows(data)
学到的东西:
- 字典的使用,for k,v in dict.items(),遍历字典
- 字典的创建和添加字典的内容
- 追加的方式写入CSV,不必重复表头,要注意嵌套的逻辑
- CSV文件的写入
- 编码问题:open函数的encoding参数选择utf-8;写入数据无空行需设置open函数的newline参数为""(空字符串)
- Ajax加载的网页分析法
2.使用selenium方法
from selenium import webdriver
import time
import requests  # NOTE(review): unused in this script; kept since the file may use it elsewhere
from lxml import etree
import pprint

# Open the Damai search page in a real browser, let the JS render, then
# scrape the result list out of the DOM with lxml.
browser = webdriver.Chrome()
browser.get('https://search.damai.cn/search.htm')
# Wait for the JS-rendered result list BEFORE reading page_source; the
# original slept only after parsing, so the tree could be built from an
# incomplete DOM and the xpath queries would come back empty.
time.sleep(3)
tree = etree.HTML(browser.page_source)
title = tree.xpath('//div[@class="items__txt__title"]/a/text()')
link = tree.xpath('/html/body/div[2]/div[2]/div[1]/div[3]/div[1]/div/div/div/div[1]/a/@href')
adress_content = tree.xpath('/html/body/div[2]/div[2]/div[1]/div[3]/div[1]/div/div/div/div[2]/text()')
# Pair each title with its link and venue/date text, then pretty-print.
data = list(zip(title, link, adress_content))
pprint.pprint(data)
time.sleep(3)
browser.back()
# wait 3 seconds
time.sleep(3)
# close the browser
browser.quit()