Notes on a Crawler Example
A web crawler (also called a web spider or web robot) is a program that simulates a browser sending network requests, receives the responses, and automatically scrapes information from the internet according to certain rules.
In principle, anything a browser (the client) can do, a crawler can do as well.
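In other words, the most basic "browser-like" behavior is just an HTTP request carrying a browser-style User-Agent header. A minimal sketch with requests (the URL here is only a placeholder):

```python
import requests

# Pretend to be a browser by attaching a browser-style User-Agent header
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
resp = requests.get('https://example.com', headers=headers)

print(resp.status_code)   # HTTP status of the response
print(resp.text[:200])    # first part of the returned HTML
```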
Preparation
- We usually speak of "Python crawlers", but this can be a bit misleading: crawlers are not exclusive to Python. Many languages can be used to write them, for example PHP, Java, C#, C++ and Python. Python is chosen here because it is relatively simple and its tooling is quite complete.
Example
Pexels is a provider of stock photos and stock footage. It was founded in Germany in 2014 and hosts a library of more than 3.2 million free photos and videos.
I previously needed to download videos from Pexels, a royalty-free stock site, so I wrote a semi-automatic crawler in Python. Why semi-automatic? Because it still needs polishing, but for my current needs it is good enough.
```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2023/2/18 20:01
# @File : download_pexels.py
import requests
import os
import re
import time

# All requests go through a local SOCKS5 proxy (a VPN client listening on port 7890)
proxies = {'http': 'socks5h://127.0.0.1:7890',
           'https': 'socks5h://127.0.0.1:7890'}

class Pexels:
    # Thin wrapper around the Pexels API endpoints

    def __init__(self, api_key):
        self.base_endpoint = 'https://api.pexels.com/v1/'
        self.video_endpoint = 'https://api.pexels.com/videos/'
        self.api_key = api_key
        self.headers = {"Authorization": self.api_key}

    def search_photos(self, query='ocean', orientation='', size='', color='', locale='', page=1, per_page=15):
        term = 'search'
        query = {'query': query, 'orientation': orientation, 'size': size,
                 'color': color, 'locale': locale, 'page': page, 'per_page': per_page}
        photos = self.fetch_pexels(term, query, 'photo')
        return photos

    def curated_photos(self, page=1, per_page=15):
        term = 'curated'
        query = {'page': page, 'per_page': per_page}
        curated = self.fetch_pexels(term, query, 'photo')
        return curated

    def get_photo(self, get_id):
        term = 'photos'
        query = {}
        curated = self.fetch_pexels(term, query, 'photo', get_id)
        return curated

    def search_videos(self, query='ocean', orientation='', size='', color='', locale='', page=1, per_page=15):
        term = 'search'
        query = {'query': query, 'orientation': orientation, 'size': size,
                 'color': color, 'locale': locale, 'page': page, 'per_page': per_page}
        videos = self.fetch_pexels(term, query, 'video')
        return videos

    def popular_videos(self, min_width='', min_height='', min_duration='', max_duration='', page=1, per_page=15):
        term = 'popular'
        query = {'min_width': min_width, 'min_height': min_height, 'min_duration': min_duration,
                 'max_duration': max_duration, 'page': page, 'per_page': per_page}
        videos = self.fetch_pexels(term, query, 'video')
        return videos

    def get_video(self, get_id):
        term = 'videos'
        query = {}
        curated = self.fetch_pexels(term, query, 'video', get_id)
        return curated

    def fetch_pexels(self, term, query, search_type, get_id=0):
        # Pick the photo or video endpoint, then send an authenticated GET request
        # through the module-level proxies defined above
        endpoint = ''
        if search_type == 'photo':
            endpoint = self.base_endpoint
        elif search_type == 'video':
            endpoint = self.video_endpoint
        if get_id > 0:
            response = requests.get('%s%s/%s' % (endpoint, term, get_id),
                                    headers=self.headers, proxies=proxies)
        else:
            response = requests.get('%s%s' % (endpoint, term),
                                    headers=self.headers, params=query, proxies=proxies)
        return response.json()

class download_pexels:
    def __init__(self, PEXELS_API, keyword, limit_num, outpath):
        '''
        Downloader.
        Args:
            PEXELS_API: your personal Pexels API key
            keyword: str, search keyword
            limit_num: int, number of videos per page (80 at most)
            outpath: str, output directory
        '''
        self.PEXELS_API = PEXELS_API
        self.keyword = keyword
        self.limit_num = limit_num
        self.outpath = outpath

    def get_id(self, page):
        pexel = Pexels(self.PEXELS_API)
        # Only small-sized videos are needed here
        search_videos_info = pexel.search_videos(self.keyword, orientation='', size='small', color='',
                                                 locale='', page=page, per_page=self.limit_num)
        return search_videos_info

    def read_already_download_files(self):
        # Read the IDs of the videos already downloaded, to avoid fetching them twice
        read_already_download_file = os.path.join(self.outpath, 'downloaded_files.txt')
        if os.path.exists(read_already_download_file):
            with open(read_already_download_file, 'rb') as f:
                already_download_file_id = f.readlines()
            already_download_file_id_list = [re.findall(r"\d+", str(x))[0] for x in already_download_file_id]
            return already_download_file_id_list
        else:
            return []

    def write_downloaded_files(self, id):
        # Append the ID of a video once it has been downloaded
        read_already_download_file = os.path.join(self.outpath, 'downloaded_files.txt')
        with open(read_already_download_file, 'a') as f:
            f.write(str(id))
            f.write('\n')

    def download(self, page):
        '''
        Download every new video on the given result page into self.outpath.
        '''
        videos = self.get_id(page)["videos"]
        retrieved_videos = self.read_already_download_files()
        for i in videos:
            # Skip videos that have already been downloaded
            if str(i["id"]) not in retrieved_videos:
                # Find the index of the smallest available rendition
                temp_ = []
                for j in i["video_files"]:
                    temp_.append(j["width"])
                ind = temp_.index(min(temp_))
                url_video = i["video_files"][ind]["link"]
                headers = {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) Gecko/20100101 Firefox/31.0'
                }
                print("start download %s" % str(i["id"]))
                result_file = os.path.join(self.outpath, str(i["id"]) + '.mp4')
                download_file(url_video, result_file, proxies, headers)
                time.sleep(2)
                # Record the downloaded ID
                self.write_downloaded_files(i["id"])

def download_file(url, local_filename, proxies, headers):
    # Download in chunks: stream the response and write it to disk piece by piece
    r = requests.get(url, stream=True, proxies=proxies, headers=headers)
    with open(local_filename, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)
                f.flush()
    return local_filename

if __name__ == '__main__':
    PEXELS_API = r''  # your personal API key, obtained after registering on pexels
    keyword = 'rain'  # topic of the videos to download
    outpath = r''  # output directory
    if not os.path.exists(outpath):
        os.makedirs(outpath)
    limit_num = 80  # a page holds at most 80 videos
    for page in range(1, 3):  # download the first and second result pages
        download_pexels(PEXELS_API, keyword, limit_num, outpath).download(page)
```
The script mainly relies on the requests library: it calls the Pexels API to obtain the video URLs, and then downloads each video in chunks, again with requests.
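For reference, the core of that two-step flow fits in a few lines. A minimal sketch that queries the video search endpoint and streams the first rendition to disk (it assumes PEXELS_API holds a valid key, and it skips the proxy and bookkeeping logic of the full script):

```python
import requests

PEXELS_API = ''  # your personal API key
resp = requests.get('https://api.pexels.com/videos/search',
                    headers={'Authorization': PEXELS_API},
                    params={'query': 'rain', 'per_page': 1})
video = resp.json()['videos'][0]
url = video['video_files'][0]['link']  # first rendition; the full script picks the smallest one

# Chunked download, same idea as download_file() above
with requests.get(url, stream=True) as r:
    with open(str(video['id']) + '.mp4', 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)
```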
Summary
Python is well known for web scraping mainly because it is easy to learn, and also because its third-party libraries are excellent: requests + BeautifulSoup handles static sites, while requests + selenium handles dynamic sites.
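As a quick illustration of the static case, a minimal requests + BeautifulSoup sketch that fetches a page and lists its links (the URL is only a placeholder):

```python
import requests
from bs4 import BeautifulSoup

# Fetch a static page and parse the HTML
html = requests.get('https://example.com').text
soup = BeautifulSoup(html, 'html.parser')

# Print every link target on the page
for a in soup.find_all('a'):
    print(a.get('href'))
```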
In remote sensing, crawlers are mostly used to fetch imagery, for example downloading the Sentinel series from ESA, the Landsat series from USGS, or Japan's Himawari-8 imagery.