Data Mining--Crawlers--Scraping Ajax Data

Ajax

  Ajax, short for Asynchronous JavaScript and XML, is a technique for exchanging data with a server and updating parts of a web page asynchronously, without reloading the page or changing its URL.
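  For a crawler this means the content you want is often not in the page's initial HTML at all; it arrives later as JSON from an XHR endpoint, which you can locate under the XHR filter of the browser's Network panel and then request directly. A minimal sketch of replaying such a request with requests (the URL below is a placeholder, not a real endpoint):

import requests

# hypothetical Ajax endpoint, as it would appear in the Network/XHR panel
url = 'https://example.com/api/list?offset=0&count=20'
# Ajax requests typically carry this header; some servers check for it
headers = {'X-Requested-With': 'XMLHttpRequest'}

response = requests.get(url, headers=headers)
if response.status_code == 200:
    print(response.json())  # the response body is JSON, not rendered HTML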

Scraping 街拍 (Street Snap) Images from Toutiao

  Scrolling through Toutiao's search results fires Ajax requests to https://www.toutiao.com/api/search/content/ with an ever-increasing offset parameter; the script below replays those requests and downloads the images listed in the returned JSON.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
@author zhangbohan.dell@gmail.com
@function: download 街拍 (street snap) images from Toutiao's Ajax search API
@create 2019/2/25 11:12
"""
import os
from hashlib import md5
from multiprocessing.pool import Pool
from urllib.parse import urlencode

import requests


def get_page(offset):
    """Fetch one page of search results and return the parsed JSON (or None)."""
    params = {
        'aid': '24',
        'app_name': 'web_search',
        'offset': offset,
        'format': 'json',
        'keyword': '街拍',
        'autoload': 'true',
        'count': '20',
        'en_qc': '1',
        'cur_tab': '1',
        'from': 'search_tab',
        'pd': 'synthesis'
    }
    url = 'https://www.toutiao.com/api/search/content/?' + urlencode(params)
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.json()
    except requests.ConnectionError:
        return None


def get_image(json_data):
    """Yield a {'image': url, 'title': title} dict for every image on the page."""
    if json_data and json_data.get('data'):
        for item in json_data.get('data'):
            title = item.get('title')
            images = item.get('image_list')
            if not title or not images:
                continue  # skip entries without a title or without images
            for image in images:
                yield {
                    'image': image.get('url'),
                    'title': title
                }


def save_image(item):
    """Save one image under a directory named after its article title.

    The MD5 of the image bytes is used as the file name, so identical
    images are written only once.
    """
    # exist_ok avoids a race when several worker processes share a title;
    # titles containing characters illegal in file names may still fail
    os.makedirs(item.get('title'), exist_ok=True)
    try:
        response = requests.get(item.get('image'))
        if response.status_code == 200:
            file_path = '{0}/{1}.{2}'.format(item.get('title'), md5(response.content).hexdigest(), 'jpg')
            if not os.path.exists(file_path):
                with open(file_path, 'wb') as f:
                    f.write(response.content)
            else:
                print('Already Downloaded', file_path)
    except requests.ConnectionError:
        print('Failed to Save Image')


def main(offset):
    json_data = get_page(offset)
    for item in get_image(json_data):
        print(item)
        save_image(item)


GROUP_START = 1
GROUP_END = 20

if __name__ == '__main__':
    pool = Pool()
    # offsets 20, 40, ..., 400 -- one page of 20 results each
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    pool.map(main, groups)
    pool.close()
    pool.join()
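  When run, the script maps offsets 20 through 400 across a process pool and writes each image into a directory named after its article title, skipping any file whose MD5 name already exists. If the API starts returning empty data, it may be rejecting the default requests User-Agent; a possible workaround is to send browser-like headers from get_page, for example (the UA string below is only illustrative):

headers = {
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/72.0.3626.119 Safari/537.36')
}
response = requests.get(url, headers=headers)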

References

Cui Qingcai's Python 3 Web Crawler Development in Practice (《Python3网络爬虫开发实战》)