import requests
from lxml import etree
import pandas as pd
import time
# Browser-like request headers sent with every requests call so the scraper
# resembles an ordinary browser; the more complete the set, the better.
# NOTE(review): the cookie looks like a captured browser session and
# presumably has to be refreshed before a run - confirm.
headers = {
    'accept': 'application/json, text/javascript, */*; q=0.01',
    # 'br' is intentionally not advertised: Requests only decodes Brotli when
    # an optional Brotli library is installed, and an undecoded body makes
    # every XPath query come back empty.
    'accept-encoding': 'gzip, deflate',
    'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'cookie':'astCity=101210100; __c=1556606066; __g=-; __l=l=%2Fwww.zhipin.com%2F&r=https%3A%2F%2Fwww.google.com%2F; Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1556606067; _uab_collina=155660607620379759699324; t=dhjAVYT92hMjBqrs; wt=dhjAVYT92hMjBqrs; __a=29958437.1556606066..1556606066.22.1.22.22; Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1556615134',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36',
    'x-requested-with': 'XMLHttpRequest',
}
def _parse_jobs(data, max_rows=30):
    """Extract job postings from one parsed search-result page.

    Each field is collected with a single XPath query over the whole
    document, and the per-field lists are then combined by position. The
    number of rows is therefore bounded by the shortest list, so a page with
    fewer postings than expected yields fewer rows instead of an IndexError.

    Args:
        data: Parsed HTML tree as returned by ``etree.HTML``.
        max_rows: Upper bound on the number of postings returned.

    Returns:
        A list of dicts with the keys ``position``, ``salary``, ``addr``,
        ``company``, ``type``, ``hr_name`` and ``push_time``.
    """
    positions = data.xpath('//*[@class="job-title"]/text()')
    salaries = data.xpath('//*[@class="red"]/text()')
    addr_nodes = data.xpath('//*[@class="info-primary"]/p')
    companies = data.xpath('//*[@class="info-company"]/div/h3/a/text()')
    type_nodes = data.xpath('//*[@class="info-company"]/div/p')
    hr_nodes = data.xpath('//*[@class="info-publis"]/h3')
    push_times = data.xpath('//*[@class="info-publis"]/p/text()')

    aligned = list(zip(positions, salaries, addr_nodes, companies,
                       type_nodes, hr_nodes, push_times))[:max_rows]

    rows = []
    for position, salary, addr_node, company, type_node, hr_node, push_time in aligned:
        # string(.) flattens an element and all of its children to plain text.
        addr = addr_node.xpath('string(.)')
        company_type = type_node.xpath('string(.)')
        hr_name = hr_node.xpath('string(.)')
        # Same console output as one tab-terminated print per field.
        print(position, salary, addr, company, company_type, hr_name,
              push_time, sep='\t', end='\t\n')
        rows.append({
            'position': position,
            'salary': salary,
            'addr': addr,
            'company': company,
            'type': company_type,
            'hr_name': hr_name,
            'push_time': push_time,
        })
    return rows


def get_info(page, csv_path='/Users/xxx/xxx/bigdatabook/boos.csv', timeout=10):
    """Scrape job-search result pages and append the postings to a CSV file.

    Pages 1 through ``page`` are requested one after another using the
    module-level ``headers``. The postings found on each page are printed
    and appended to ``csv_path``.

    Args:
        page: Number of result pages to fetch, starting at page 1.
        csv_path: CSV file that rows are appended to.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        None. Results are written to ``csv_path``.
    """
    header_pending = True
    for j in range(page):
        page_no = j + 1
        ka = 'page' + '-' + str(page_no)
        url = 'https://www.zhipin.com/c101210100/?query=%E6%95%B0%E6%8D%AE&page={0}&ka={1}'.format(page_no, ka)
        print(url)
        # Without a timeout a stalled connection would block the run forever.
        response = requests.get(url, headers=headers, timeout=timeout).text
        data = etree.HTML(response)
        print("正在爬取第", page_no, "的数据")
        result = _parse_jobs(data)
        if not result:
            # Nothing parsed from this page; do not write an empty frame.
            continue
        df = pd.DataFrame(result)
        # The column header goes out once, with the first batch of rows.
        df.to_csv(csv_path, mode='a', header=header_pending)
        header_pending = False
# Module-level entry point: scrape result pages 1 through 7 as soon as this
# file is executed (or imported).
get_info(7)