爬虫
- 在Google Chrome浏览器上安装XPath Helper插件
- 实例:爬取图书的价格,并按价格排序等
import requests
from lxml import html
import pandas as pd
from matplotlib import pyplot as plt
# Configure matplotlib once for the whole script: select the SimHei font
# (intended to let Chinese labels render) and disable the Unicode minus glyph.
plt.rcParams.update({
    "font.sans-serif": ['SimHei'],
    'axes.unicode_minus': False,
})
def spider_dangdang(isbn):
    """Scrape Dangdang search results for a book and summarise the prices.

    Fetches the Dangdang search page for ``isbn``, extracts the title, price,
    purchase link and seller of every listing, prints the listings sorted by
    ascending price, shows a horizontal bar chart of the (at most) ten
    cheapest listings and writes all listings to ``dangdang.csv``.

    Listings that lack a title, link or parseable price are skipped instead
    of aborting the whole run.

    Args:
        isbn: ISBN (or any other search keyword) to look up.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Browser-like User-Agent so the request is not rejected as a bot.
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"}
    # Let requests build (and URL-encode) the query string; a timeout keeps
    # the call from hanging forever on an unresponsive server.
    resp = requests.get(
        'http://search.dangdang.com/',
        params={'key': isbn, 'act': 'input'},
        headers=headers,
        timeout=10,
    )
    resp.raise_for_status()

    # Parse the page and select one <li> per listing.
    selector = html.fromstring(resp.text)
    ul_list = selector.xpath('//div[@id="search_nature_rg"]/ul/li')
    print('您好,共有{}家店铺售卖此图书'.format(len(ul_list)))

    book_list = []
    for li in ul_list:
        titles = li.xpath('./a/@title')
        links = li.xpath('a/@href')
        prices = li.xpath('./p[@class="price"]/span[@class="search_now_price"]/text()')
        # Skip listings missing a required field rather than raising IndexError.
        if not (titles and links and prices):
            continue

        title = titles[0].strip()
        print('title---', title)

        # The price text carries a leading currency sign, e.g. "¥35.90".
        try:
            price = float(prices[0].replace('¥', ''))
        except ValueError:
            continue
        print(price)

        # Listings without a seller link are labelled as Dangdang's own store.
        sellers = li.xpath('./p[@class="search_shangjia"]/a/text()')
        store = sellers[0] if sellers else '当当自营'

        book_list.append({
            'title': title,
            'price': price,
            'link': links[0],
            'store': store,
        })

    # Cheapest first.
    book_list.sort(key=lambda book: book['price'])
    for book in book_list:
        print(book)

    # Bar chart of the cheapest listings; slicing copes with fewer than ten.
    top10_store = book_list[:10]
    store_names = [book['store'] for book in top10_store]
    print(store_names)
    store_prices = [book['price'] for book in top10_store]
    print(store_prices)
    if top10_store:
        plt.barh(store_names, store_prices)
        plt.show()

    # Persist every listing as CSV.
    df = pd.DataFrame(book_list)
    df.to_csv('dangdang.csv')
# Run the Dangdang scraper for a sample ISBN.
spider_dangdang(isbn='9787115428028')
- 作业:爬电影网站,得到电影名、想看人数等信息,绘制分析图
import requests
from lxml import html
import pandas as pd
from matplotlib import pyplot as plt
# Configure matplotlib once for the whole script: select the SimHei font
# (intended to let Chinese labels render) and disable the Unicode minus glyph.
plt.rcParams.update({
    "font.sans-serif": ['SimHei'],
    'axes.unicode_minus': False,
})
def spider_douban():
    """Scrape Douban's upcoming-films page for Chongqing and chart the data.

    Downloads the listing, extracts name, release date, genre, country and
    want-to-see count for each film, prints the films sorted by want-to-see
    count (descending), and then produces:

    * a word cloud of the countries, saved to ``上映国家.png`` (uses the mask
      image ``./image/china.jpg`` and the font ``msyh.ttc``);
    * a pie chart of how often each country occurs;
    * a horizontal bar chart of the five most wanted films.

    The two charts are drawn on separate figures. Entries that lack any of
    the fields, or whose want-to-see count is not an integer, are skipped.
    If no film could be parsed the function returns after printing the
    (empty) list, without drawing anything.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
    """
    from collections import Counter

    url = 'https://movie.douban.com/cinema/later/chongqing/?qq-pf-to=pcqq.group'
    print(url)
    # Browser-like User-Agent so the request is not rejected as a bot.
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"}
    resp = requests.get(url, headers=headers, timeout=10)
    resp.raise_for_status()

    selector = html.fromstring(resp.text)
    ul_list = selector.xpath("//div[@id='showing-soon']/div")
    print('重庆即将上映的电影有',len(ul_list),'部')

    # XPath (relative to one film node) for each field, in output order:
    # name, release date, genre, country, want-to-see text.
    field_paths = (
        "./div/h3/a/text()",
        "./div/ul/li[1]/text()",
        "./div/ul/li[2]/text()",
        "./div/ul/li[3]/text()",
        "./div/ul/li/span/text()",
    )

    movie_list = []
    for li in ul_list:
        matches = [li.xpath(path) for path in field_paths]
        # Skip incomplete entries rather than raising IndexError.
        if not all(matches):
            continue
        name, release_date, genre, country, want_text = (m[0] for m in matches)
        # The count is rendered like "12345人想看"; strip the suffix.
        try:
            want_count = int(want_text.replace('人想看', ''))
        except ValueError:
            continue
        movie_list.append({
            '电影名': name,
            '上映日期': release_date,
            '类型': genre,
            '上映国家': country,
            '想看人数': want_count,
        })

    # Most wanted first.
    movie_list.sort(key=lambda movie: movie['想看人数'], reverse=True)
    print(movie_list)
    if not movie_list:
        # Nothing to visualise.
        return

    movie_city = [movie['上映国家'] for movie in movie_list]
    print('上映国家', movie_city)

    # ---- Word cloud of countries ----
    text = ' '.join(movie_city)
    print(text)
    # Imported lazily: only this step needs these third-party packages.
    from wordcloud import WordCloud
    import imageio
    mask = imageio.imread('./image/china.jpg')
    WordCloud(
        font_path='msyh.ttc',
        background_color='black',
        width=800,
        height=600,
        collocations=False,  # do not pair adjacent words into bigrams
        mask=mask
    ).generate(text).to_file('上映国家.png')

    # ---- Pie chart: share of each country ----
    counts = Counter(movie_city)
    print(dict(counts))
    ranked = counts.most_common()  # (country, occurrences), most frequent first
    print(len(ranked))
    labels = [country for country, _ in ranked]
    sizes = [occurrences for _, occurrences in ranked]
    print(labels)
    print(sizes)
    # Pull out only the largest slice; explode must match the slice count.
    explode = [0.1] + [0] * (len(sizes) - 1)
    plt.figure()
    plt.pie(sizes, explode=explode, shadow=True, labels=labels, autopct='%1.1f%%')

    # ---- Bar chart: five most wanted films ----
    top5 = movie_list[:5]
    name_movie = [movie['电影名'] for movie in top5]
    print('电影名', name_movie)
    movie_wantnum = [movie['想看人数'] for movie in top5]
    print('想看人数', movie_wantnum)
    # Separate figure so the bars are not drawn on top of the pie chart.
    plt.figure()
    plt.barh(name_movie, movie_wantnum)
    plt.show()
# Run the Douban scraper.
spider_douban()
# # # Get the response body as str
# # print(response.text)
# # # Get the response body as bytes
# # print(response.content)
# # # Get the response headers
# # print(response.headers)
# # # Get the status code
# # print(response.status_code)
上映国家云图.PNG
上映国家占比.PNG
想看人数TOP5电影.PNG