2 Fetching the page information
import random
import re
import time

import requests
from bs4 import BeautifulSoup

# `cookie`, `head`, get_cookies_snuid(), str_to_dict() and get_proxies()
# are defined in the earlier part of this article.


def get_message():
    """
    Fetch the relevant information from each search results page.
    """
    failed_list = []
    for i in range(1, 101):
        print('Page ' + str(i))
        delay = float(random.randint(15, 20))
        print(delay)
        # Delay between requests; according to what I found via Baidu,
        # a delay of 15s or more keeps you from getting blocked.
        time.sleep(delay)
        # Refresh the SNUID value every 10 pages
        if (i - 1) % 10 == 0:
            value = get_cookies_snuid()
            snuid = 'SNUID=' + value + ';'
        # Build the Cookie string
        cookies = cookie + snuid
        url = 'http://weixin.sogou.com/weixin?query=python&type=2&page=' + str(i) + '&ie=utf8'
        host = cookies + '\n'
        header = head + host
        headers = str_to_dict(header)
        # Set the proxy IP
        proxies = get_proxies(i)
        try:
            response = requests.get(url=url, headers=headers, proxies=proxies)
            html = response.text
            soup = BeautifulSoup(html, 'html.parser')
            data = soup.find_all('ul', {'class': 'news-list'})
            lis = data[0].find_all('li')
            for j in range(len(lis)):
                # Article title; swap commas for full-width ones so they
                # do not break the comma-delimited CSV
                h3 = lis[j].find_all('h3')
                title = h3[0].get_text().replace('\n', '').replace(',', '，')

                # Article summary
                p = lis[j].find_all('p')
                article = p[0].get_text().replace(',', '，')

                # Official account name
                a = lis[j].find_all('a', {'class': 'account'})
                name = a[0].get_text()

                # Publication date: a 10-digit Unix timestamp inside the span
                span = lis[j].find_all('span', {'class': 's2'})
                cmp = re.findall(r'\d{10}', span[0].get_text())
                date = time.strftime('%Y-%m-%d', time.localtime(int(cmp[0])))

                with open('sg_articles.csv', 'a+', encoding='utf-8-sig') as f:
                    f.write(title + ',' + article + ',' + name + ',' + date + '\n')
            print('Page ' + str(i) + ' succeeded')
        except Exception:
            print('Page ' + str(i) + ' failed')
            failed_list.append(i)
            continue
    # Print the page numbers that failed
    print(failed_list)


def main():
    get_message()


if __name__ == '__main__':
    main()
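The helpers get_cookies_snuid(), str_to_dict() and get_proxies(), along with the cookie and head strings, come from the earlier part of the article. For reference only, here is a minimal sketch of what str_to_dict might look like, assuming the raw headers are pasted as "Name: value" lines copied from the browser's developer tools; this is an assumed implementation, not the article's own definition.

def str_to_dict(header):
    """Sketch: turn a block of 'Name: value' lines into the dict requests expects."""
    headers = {}
    for line in header.splitlines():
        if ':' not in line:
            continue  # skip blank or malformed lines
        key, value = line.split(':', 1)
        headers[key.strip()] = value.strip()
    return headers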