Scraping 100,000 Second-Hand Listings from Ganji.com with Python

Published: 2017-04-22 · Category: Python

1. Start MongoDB

mongod
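If MongoDB was not installed as a service, mongod may refuse to start until it is pointed at an existing data directory. The path below is only an example; adjust it to your machine:

mongod --dbpath /data/db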

2. Generate the data

ganjiSpider.py

# -*- coding: utf-8 -*-

# Three spider functions plus a run entry point:
# - get_index_url: collect the category (second-level) links under Ganji's second-hand section
# - get_links_from / get_all_links_from: collect every listing link under a category and save it to MongoDB
# - get_items_info: scrape a listing's detail page and save its fields to MongoDB
# - the __main__ block wires the stages together

# pymongo is needed to talk to MongoDB
import pymongo
import time
import urllib.request
from bs4 import BeautifulSoup
from multiprocessing import Pool

# Connect to MongoDB at localhost:27017
client = pymongo.MongoClient('localhost', 27017)

# Select the database named "ganji"
ganji = client['ganji']

# Collection that stores the listing URLs
url_list = ganji['url_list']

# Collection that stores the scraped listing details
item_info = ganji['item_info']

start_url = 'http://bj.ganji.com/wu/'
url_host = 'http://bj.ganji.com/'


# 1. Collect the category (second-level) links under Ganji's second-hand section
def get_index_url(url):
    # wb_data = requests.get(url)
    # soup = BeautifulSoup(wb_data.text, 'lxml')
    # print(soup.original_encoding)

    # from_encoding='utf8' works around garbled Chinese text from BeautifulSoup on Python 3
    wb_data = urllib.request.urlopen(url)
    soup = BeautifulSoup(wb_data, 'lxml', from_encoding='utf8')

    links = soup.select('.fenlei > dt > a')
    for link in links:
        page_url = url_host + link.get('href')[1:]
        print(page_url)


# get_index_url(start_url)

# Category links under Ganji's second-hand section (the output of get_index_url)
channel_list = '''http://bj.ganji.com/jiaju/
http://bj.ganji.com/rirongbaihuo/
http://bj.ganji.com/shouji/
http://bj.ganji.com/shoujihaoma/
http://bj.ganji.com/bangong/
http://bj.ganji.com/nongyongpin/
http://bj.ganji.com/jiadian/
http://bj.ganji.com/ershoubijibendiannao/
http://bj.ganji.com/ruanjiantushu/
http://bj.ganji.com/yingyouyunfu/
http://bj.ganji.com/diannao/
http://bj.ganji.com/xianzhilipin/
http://bj.ganji.com/fushixiaobaxuemao/
http://bj.ganji.com/meironghuazhuang/
http://bj.ganji.com/shuma/
http://bj.ganji.com/laonianyongpin/
http://bj.ganji.com/xuniwupin/
http://bj.ganji.com/qitawupin/
http://bj.ganji.com/ershoufree/
http://bj.ganji.com/wupinjiaohuan/
'''


# spider1: collect the listing links on one page of a category
def get_links_from(channel, pages):
    # e.g. channel='http://bj.ganji.com/jiaju/', pages=2 -> 'http://bj.ganji.com/jiaju/o2/'
    list_view = '{}o{}/'.format(channel, str(pages))
    print(list_view)
    wb_data = urllib.request.urlopen(list_view)
    # Throttle requests
    time.sleep(1)
    soup = BeautifulSoup(wb_data, 'lxml', from_encoding='utf8')
    if soup.find('td'):
        for i in soup.select('.zzinfo td.t a'):
            data = {
                "topic": channel[20:-1],  # strip 'http://bj.ganji.com/' and the trailing '/'
                "url": i.get('href')
            }
            url_list.insert_one(data)
    else:
        # No listings found: we are past the last page
        pass


# spider2: scrape the detail page of one listing
def get_items_info(topic, url):
    wb_data = urllib.request.urlopen(url)
    # Throttle requests
    time.sleep(1)
    soup = BeautifulSoup(wb_data, 'lxml', from_encoding='utf8')
    if soup.find('h1'):
        title = soup.select('.info_titile')[0].text
        price = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.info_massege.left > div.price_li > span > i')[0].text
        watch = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.box_left_top > p > span.look_time')[0].text
        location = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.info_massege.left > div.palce_li > span > i')[0].text
        desc = soup.select('.baby_kuang p')[0].text
        data = {
            "topic": topic,
            "title": title,
            "url": url,
            "price": price,
            "watch": watch,
            "location": location,
            "desc": desc
        }
        print(data)
        item_info.insert_one(data)
    else:
        pass


# get_items_info("jiaju", "http://zhuanzhuan.ganji.com/detail/864382127585951752z.shtml?from=pc&source=ganji&cate=%E5%8C%97%E4%BA%AC%E8%B5%B6%E9%9B%86%7C%E5%8C%97%E4%BA%AC%E4%BA%8C%E6%89%8B%7C%E5%8C%97%E4%BA%AC%E4%BA%8C%E6%89%8B%E7%AC%94%E8%AE%B0%E6%9C%AC&cateurl=bj|wu|ershoubijibendiannao&gj_other_ifid=from_ganji&gj_other_city_id=12&gj_other_gc_1=wu&gj_other_uuid=&gj_other_ca_s=&gj_other_ca_kw=&gj_other_ca_n=&gj_other_ca_i=&gj_other_sid=")


# Walk every page of a category and save its listing links to MongoDB
def get_all_links_from(channel):
    for num in range(1, 1000):
        get_links_from(channel, num)


# Walk every saved URL and scrape its detail page
def To_get_items_info():
    for i in url_list.find():
        topic = i['topic']
        url = i['url']
        get_items_info(topic, url)


if __name__ == '__main__':
    # Stage 1: collect listing URLs for every category with a multiprocessing pool
    # (split() also drops any blank entries in channel_list)
    pool = Pool()
    pool.map(get_all_links_from, channel_list.split())

    # Stage 2: scrape the detail page of every collected URL
    To_get_items_info()
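While the spider runs, progress can also be checked from a separate Python session. The snippet below is a sketch that is not part of the original script: it reuses the same database and collection names and assumes pymongo 3.7 or newer (older versions used count() instead of count_documents()).

import pymongo

client = pymongo.MongoClient('localhost', 27017)
ganji = client['ganji']

# Totals for the two collections the spider writes to
print('url_list :', ganji['url_list'].count_documents({}))
print('item_info:', ganji['item_info'].count_documents({}))

# Breakdown of scraped detail pages by topic (category)
pipeline = [
    {'$group': {'_id': '$topic', 'count': {'$sum': 1}}},
    {'$sort': {'count': -1}},
]
for row in ganji['item_info'].aggregate(pipeline):
    print(row['_id'], row['count'])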

3. Open the MongoDB shell

mongo

4. Count the documents in the ganji database

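A minimal way to get the counts from the shell opened in step 3, using the collection names defined in ganjiSpider.py (on recent MongoDB shells, countDocuments() is preferred over count()):

use ganji
db.url_list.count()
db.item_info.count()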

Author: Matrix
Original link: https://matrixsparse.github.io/2017/04/22/Python实现爬取赶集网10万二手商品数据/
License: Unless otherwise stated, all posts on this blog are licensed under CC BY-NC-SA 4.0. Please credit the source when reposting.
