前景提要
HDC调试需求开发(15万预算),能者速来!>>> # -*- coding: utf-8 -*- import scrapy from scrapy.shell import inspect_response from scrapy.http import Request, FormRequest #from scrapy.selector import Selector import requests import time import json class ZhihuSpider(scrapy.Spider): name = 'zhihu' allowed_domains = ['zhihu.com'] start_urls = ['https://zhihu.com/'] referer_url = "https://www.zhihu.com/" login_url = "https://www.zhihu.com/login/email" check_login_url = "https://www.zhihu.com/settings/profile" login_formdata = { 'email': 'xxxx', 'password': 'eee' } headers = { "Accept": "*/*", "Accept-Encoding": "gzip,deflate", "Accept-Language": "en-US,en;q=0.8,zh-TW;q=0.6,zh;q=0.4", "Connection": "keep-alive", "Upgrade-Insecure-Requests":"1", "Content-Type":"application/x-www-form-urlencoded; charset=UTF-8", "X-Requested-With":"XMLHttpRequest", "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36" } def start_requests(self): print('start_requests') self.headers['Referer'] = self.referer_url self.headers['Host'] = "www.zhihu.com" return [Request("https://www.zhihu.com", meta = {'cookiejar' : 1}, headers = self.headers,callback = self.post_login)] def captcha_handler(self, response): t = str(int(time.time() * 1000)) captcha_url = 'https://www.zhihu.com/captcha.gif?r=' + t + "&type=login" with open("code.jpg",'wb') as w: req=requests.Session() p=req.get(url=captcha_url,headers=self.headers) w.write(p.content) code=input("请输入验证码:") if not code: sys.exit(1) else: self.login_formdata["captcha"] = code def pre_signin_handler(self, response): #inspect_response(response,self) _xsrf = response.xpath('//input[@name="_xsrf"]/@value').extract_first() self.headers["X-Xsrftoken"] = _xsrf self.login_formdata['_xsrf'] = _xsrf print(self.login_formdata['_xsrf']) self.captcha_handler(response) def post_login(self, response): print('post_login') self.pre_signin_handler(response) print(self.login_formdata) #FormRequeset.from_response是Scrapy提供的一个函数, 用于post表单 return FormRequest("https://www.zhihu.com/login/email",meta={'cookiejar':response.meta['cookiejar']}, headers = self.headers, formdata = { '_xsrf':self.login_formdata['_xsrf'], 'password':'xxxx', 'email':'eeee', 'captcha':self.login_formdata["captcha"], }, callback = self.__check_login_status, ) ''' return [FormRequest.from_response(response, meta ={'cookiejar' : 1},#cookies = self.cookies, headers = self.headers, #注意此处的headers formdata = self.login_formdata, callback = self.__check_login_status, url = self.login_url, dont_filter = True )] ''' def __check_login_status(self, response): # '用来检测是否登陆成功' print("----__check_login_status----") print(eval(response.text)) if json.loads(response.text)['r'] == 0: print("登录成功") else: print("登录失败") #from scrapy.shell import inspect_response #inspect_response(response, self) yield Request(self.check_login_url, meta = {'cookiejar' : 1}, headers=self.headers,callback=self.parse_user_detail) def parse_user_detail(self, response): print("----parse_user_detail----") #with open('response_of_user_detil.html','wb') as file: # file.write(response.body)
用scrapy模拟登录知乎会出现验证码失败的情况,返回:
{'data': {'captcha': '验证码会话无效 :(', 'name': 'ERR_VERIFY_CAPTCHA_SESSION_IN
VALID'}, 'msg': '验证码会话无效 :(', 'r': 1, 'errcode': 1991829}
哪位大神有用scrapy模拟登录过?清指点下,谢谢!