爬虫实战-伪装网站爬取招聘信息

记录自己的成长

Posted by BY on May 12, 2019

废话不多说,继续上源码。

import requests
from lxml import etree
import pandas as pd
import time

# Request headers copied from a real browser session so the request looks
# like ordinary browser traffic; the more complete the set, the better.
headers = {
    "accept": "application/json, text/javascript, */*; q=0.01",
    "accept-encoding": "gzip, deflate, br",
    "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
    # Session cookie, listed one pair per line and joined back into the
    # single "name=value; name=value" string sent in the header.
    "cookie": "; ".join([
        "astCity=101210100",
        "__c=1556606066",
        "__g=-",
        "__l=l=%2Fwww.zhipin.com%2F&r=https%3A%2F%2Fwww.google.com%2F",
        "Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1556606067",
        "_uab_collina=155660607620379759699324",
        "t=dhjAVYT92hMjBqrs",
        "wt=dhjAVYT92hMjBqrs",
        "__a=29958437.1556606066..1556606066.22.1.22.22",
        "Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1556615134",
    ]),
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/74.0.3729.108 Safari/537.36"
    ),
    "x-requested-with": "XMLHttpRequest",
}

def get_info(page, csv_path='/Users/xxx/xxx/bigdatabook/boos.csv'):
    """Scrape job listings from the search result pages and append them to a CSV.

    Pages 1..``page`` of the search URL are fetched with the module-level
    ``headers``. For every listing found on a page the position, salary,
    address line, company, company type line, recruiter name and publish
    time are printed (tab separated) and collected; each page's rows are
    then appended to ``csv_path`` in a single write.

    Args:
        page: Number of result pages to fetch, starting from page 1.
            Zero or a negative number fetches nothing.
        csv_path: Destination CSV file, opened in append mode. Defaults to
            the path that was previously hard-coded.

    Returns:
        None. Results are printed and written to ``csv_path``.
    """
    # Explicit column order so the CSV layout does not depend on how the
    # installed pandas version orders dict keys.
    columns = ['position', 'salary', 'addr', 'company', 'type', 'hr_name', 'push_time']
    header_written = False

    for page_no in range(1, page + 1):
        ka = 'page-{0}'.format(page_no)
        url = 'https://www.zhipin.com/c101210100/?query=%E6%95%B0%E6%8D%AE&page={0}&ka={1}'.format(page_no, ka)
        print(url)
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url, headers=headers, timeout=10).text
        data = etree.HTML(response)

        print("正在爬取第", page_no, "的数据")

        # Run each XPath query once per page; none of them depends on the
        # listing index, so re-running them per listing was wasted work.
        positions = data.xpath('//*[@class="job-title"]/text()')
        salaries = data.xpath('//*[@class="red"]/text()')
        # string(.) flattens an element and its children into plain text.
        addrs = [node.xpath('string(.)')
                 for node in data.xpath('//*[@class="info-primary"]/p')]
        companies = data.xpath('//*[@class="info-company"]/div/h3/a/text()')
        company_types = [node.xpath('string(.)')
                         for node in data.xpath('//*[@class="info-company"]/div/p')]
        hr_names = [node.xpath('string(.)')
                    for node in data.xpath('//*[@class="info-publis"]/h3')]
        push_times = data.xpath('//*[@class="info-publis"]/p/text()')

        # zip() stops at the shortest field list, so a page with fewer
        # listings than usual (or none at all) no longer raises IndexError.
        rows = []
        for fields in zip(positions, salaries, addrs, companies,
                          company_types, hr_names, push_times):
            # Same console layout as before: every field followed by a tab.
            print(*fields, sep='\t', end='\t\n')
            rows.append(dict(zip(columns, fields)))

        if not rows:
            print("第", page_no, "页没有解析到职位数据,已跳过")
            continue

        # Append once per page. Writing inside the per-listing loop would
        # re-append every row collected so far on each iteration.
        pd.DataFrame(rows, columns=columns).to_csv(
            csv_path, mode='a', header=not header_written)
        header_written = True



# Script entry: scrape the first 7 result pages.
get_info(page=7)