请求一页的数据
from urllib.parse import urlencode
from fake_useragent import UserAgent
import requests
# Endpoint that the urlencoded query string is appended to.
base_url = 'https://weixin.sogou.com/weixin?'

# A single UserAgent instance is sufficient: the previous ``for i in range(10)``
# loop never used ``i`` and only rebuilt the same object ten times.
ua = UserAgent()

# Request headers sent with every call to requests.get().
# NOTE(review): the Cookie and Referer values are hard-coded; presumably they
# were copied from a browser session and will stop working once it expires.
# NOTE(review): the cookie contains 'ABTEST =0' with a space before '=' - it is
# kept verbatim here; confirm against the browser's original value.
headers = {
    # presumably a Chrome User-Agent string supplied by fake_useragent
    'User-Agent': ua.chrome,
    'Cookie': 'SUID=E524B5754A42910A000000005C486694; GOTO=Af99047; SNUID=110C47812B2EA82D933368D72BFC4F7C; SUID=3B276DAB3320910A000000005C791795; SUV=1551439765053021; ABTEST =0|1551439780|v1; weixinIndexVisited=1; sct=2; IPLOC=US; JSESSIONID=aaaV_HTrCV-bs5r5SmZKw',
    'DNT': '1',
    'Host': 'weixin.sogou.com',
    'Referer': 'https://weixin.sogou.com/weixin?query=%E9%A3%8E%E6%99%AF&_sug_type_=&sut=10105&lkt=1%2C1551444385751%2C1551444385751&s_from=input&_sug_=y&type=2&sst0=1551444385855&page=4&ie=utf8&w=01019900&dr=1',
    'Upgrade-Insecure-Requests': '1',
}
def get_html(url, retries=3, timeout=10):
    """Fetch ``url`` using the module-level ``headers`` and return its body.

    Redirects are not followed (``allow_redirects=False``), so a 302 answer
    is seen here rather than being followed transparently.

    Args:
        url: Absolute URL to request.
        retries: Number of extra attempts made after a connection failure or
            timeout before giving up.
        timeout: Seconds to wait on the server for each attempt.

    Returns:
        The response text when the status code is 200. ``None`` for any
        other status code (including 302) or when every attempt failed.
    """
    for _ in range(retries + 1):
        try:
            response = requests.get(
                url,
                allow_redirects=False,  # do not follow status-code redirects
                headers=headers,
                timeout=timeout,
            )
        except (requests.exceptions.ConnectionError,
                requests.exceptions.Timeout):
            # The builtin ConnectionError is a different class from the one
            # requests raises, so it must be named explicitly. Retrying in a
            # bounded loop replaces the former unbounded recursion.
            continue
        if response.status_code == 200:
            return response.text
        # 302: Need Proxy (not handled yet). Every non-200 status yields None.
        return None
    return None
def get_index(keyword, page):
    """Fetch one search-result page for ``keyword`` and print its HTML.

    The query parameters are urlencoded, appended to the module-level
    ``base_url``, fetched through ``get_html`` and the result is printed.

    Args:
        keyword: Search term placed in the ``query`` parameter.
        page: Page number placed in the ``page`` parameter.

    Returns:
        None. The fetched HTML (or ``None`` when ``get_html`` returned
        nothing) is written to stdout.
    """
    data = {
        'query': keyword,
        '_sug_type_': '',
        'sut': '10105',
        # A dict literal keeps only the last of several identical keys, so the
        # three former 'lkt' entries collapsed into one. The Referer header
        # used by this script shows the intended single comma-separated value.
        'lkt': '1,1551444385751,1551444385751',
        's_from': 'input',
        '_sug_': 'y',
        'type': '2',
        'sst0': '1551444385855',
        'page': page,
        'ie': 'utf8',
        'w': '01019900',
        'dr': '1',
    }
    queries = urlencode(data)
    url = base_url + queries
    html = get_html(url)
    print(html)
if __name__ == '__main__':
    # Script entry point: request and print page 1 for the sample keyword.
    get_index(keyword='风景', page=1)
或者(简化版:使用固定的 User-Agent,只保留 query、type、page 三个查询参数):
from urllib.parse import urlencode
import requests
# from fake_useragent import UserAgent
# Endpoint that the urlencoded query string is appended to.
base_url = 'https://weixin.sogou.com/weixin?'

# NOTE(review): hard-coded session cookie; presumably copied from a logged-in
# browser session, so it will expire and should not be shared.
_SESSION_COOKIE = 'SUID=AD16F3684842910A000000005C8BBA37; SUID=AD16F3685F20940A000000005C8DC9DD; SUV=1552796127398265; weixinIndexVisited=1; IPLOC=CN4503; SNUID=90BCA5D6A8A221264F4A3753A819E9E7; ld=Slllllllll2tHVOAlllllVh1QQwlllllty1Unlllll9lllllRZlll5@@@@@@@@@@; LSTMV=513%2C66; LCLKINT=7794; ABTEST=0|1555323243|v1; sct=4; JSESSIONID=aaa5eAFG-dttfa3oafQOw; ppinf=5|1555489939|1556699539|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZToxODolRTYlOTclQUQlRTklOUMlODF8Y3J0OjEwOjE1NTU0ODk5Mzl8cmVmbmljazoxODolRTYlOTclQUQlRTklOUMlODF8dXNlcmlkOjQ0Om85dDJsdUhrWUMyd0pnSWltZV9uUWlpejMxVlFAd2VpeGluLnNvaHUuY29tfA; pprdig=VcvZyhwxPcKDiwxCwyw2Koh-Uv1YIPN-5G8q3E80pe6DG6QkpohqbvKPtVOCFIKEicErxAvHReq7mi_gcmksxeKFksKKI3oObOQ91d2W-rZ2S_OuSCBBT3w_WQJIFLXm29LuaWUEY4Og8qtFOqLFgpeS80WhdALO1xkagTvwTBc; sgid=01-38033375-AVy25JNahCvzudZ4dtHBSF8; ppmdig=15554899390000009d2b45d27208d0d2841bb54b91a41816'

# Fixed desktop Chrome User-Agent. A random one from fake_useragent
# (UserAgent().random) was the author's noted alternative.
_CHROME_UA = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'

# Request headers sent with every call to requests.get().
headers = {
    'Cookie': _SESSION_COOKIE,
    'Host': 'weixin.sogou.com',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': _CHROME_UA,
}
def get_html(url, retries=3, timeout=10):
    """Fetch ``url`` using the module-level ``headers`` and return its body.

    Redirects are not followed (``allow_redirects=False``), so a 302 answer
    is seen here rather than being followed transparently.

    Args:
        url: Absolute URL to request.
        retries: Number of extra attempts made after a connection failure or
            timeout before giving up.
        timeout: Seconds to wait on the server for each attempt.

    Returns:
        The response text when the status code is 200. ``None`` for any
        other status code (including 302) or when every attempt failed.
    """
    for _ in range(retries + 1):
        try:
            response = requests.get(
                url,
                allow_redirects=False,
                headers=headers,
                timeout=timeout,
            )
        except (requests.exceptions.ConnectionError,
                requests.exceptions.Timeout):
            # The builtin ConnectionError is a different class from the one
            # requests raises, so it must be named explicitly. Retrying in a
            # bounded loop replaces the former unbounded recursion.
            continue
        if response.status_code == 200:
            return response.text
        # 302: Need Proxy (not handled yet). Every non-200 status yields None.
        return None
    return None
def get_index(keyword, page):
    """Fetch one search-result page for ``keyword`` and print its HTML.

    Args:
        keyword: Search term placed in the ``query`` parameter.
        page: Page number placed in the ``page`` parameter.

    Returns:
        None. Whatever ``get_html`` returned is written to stdout.
    """
    params = {'query': keyword, 'type': 2, 'page': page}
    # base_url already ends with '?', so the encoded query is appended as-is.
    print(get_html(base_url + urlencode(params)))
if __name__ == '__main__':
    # Script entry point: request and print page 1 for the sample keyword.
    get_index(keyword='风景', page=1)
韧桂 2019-03-02