Table of Contents

Disclaimer
Task
File overview
Scraping Dangdang with a single pipeline
  pipelines.py
  items.py
  settings.py
  dang.py
Downloading images with multiple pipelines
  pipelines.py
  settings.py
Multi-page crawling
  dang.py
  pipelines.py
  settings.py
  items.py
Summary

Disclaimer

This article is for learning purposes only and has no commercial use.

Some of the images in this article come from 尚硅谷 (Atguigu).

Task

Scrape all product data from Dangdang's auto-accessories category page ("汽车用品【价格 品牌 推荐 正品折扣】- 当当网").

File overview

In the Scrapy framework, items and pipelines are the two tools used to hold, process, and store scraped data.

Items: an Item is a container for scraped data. It behaves much like a Python dict and can hold arbitrary fields and their values. You define your own Item class, create Item objects inside the spider, fill them with the scraped data, and yield them so they can be passed on to the pipelines for further processing and storage.

Pipelines: a pipeline is a component that processes and stores Item objects. Once the spider has filled an Item, the Item travels through every enabled pipeline, which can clean, deduplicate, validate, or persist the data. Multiple pipelines can be defined, and they run in priority order.

This project uses both; the sketch below shows how the pieces fit together.
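As a rough illustration of that flow (a minimal, self-contained sketch with hypothetical names — DemoItem, DemoPipeline, DemoSpider — not the project code, which follows below): the spider yields an Item, and every pipeline enabled in ITEM_PIPELINES receives it through process_item.

import scrapy

# Item: a dict-like container declaring which fields we intend to collect.
class DemoItem(scrapy.Item):
    title = scrapy.Field()

# Pipeline: receives every Item the spider yields (once enabled in ITEM_PIPELINES).
class DemoPipeline:
    def process_item(self, item, spider):
        print(item["title"])
        return item  # pass the item on to any lower-priority pipeline

# Spider: fills the Item and yields it, which hands it to the pipelines.
class DemoSpider(scrapy.Spider):
    name = "demo"
    start_urls = ["https://example.com"]

    def parse(self, response):
        yield DemoItem(title=response.xpath("//title/text()").get())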

Scraping Dangdang with a single pipeline

The diagram referenced below comes from 尚硅谷 (Atguigu).

pipelines.py

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter


# To use a pipeline, it must be enabled in settings.py
class ScrapyDangdang060Pipeline:

    def open_spider(self, spider):
        """Runs once, before the spider starts crawling."""
        print("++++++++++=========")
        self.fp = open('book.json', 'w', encoding='utf-8')

    # item is the book object yielded by the spider:
    # book = ScrapyDangdang060Item(src=src, name=name, price=price)
    def process_item(self, item, spider):
        # Not recommended: opening the file once per item means far too
        # many file operations.
        # (1) write() only accepts strings, not arbitrary objects.
        # (2) Mode 'w' would reopen and truncate the file for every item,
        #     so each write would overwrite the previous one; 'a' appends.
        # with open('book.json', 'a', encoding='utf-8') as fp:
        #     fp.write(str(item))

        # Keeping a single file handle open (see open_spider) avoids
        # reopening the file over and over.
        self.fp.write(str(item))
        return item

    def close_spider(self, spider):
        """Runs once, after the spider has finished."""
        print("------------------==========")
        self.fp.close()

Uncomment ITEM_PIPELINES in settings.py to enable the pipeline:

ITEM_PIPELINES = {
    # There can be many pipelines. Each one has a priority in the range 1-1000;
    # the smaller the number, the higher the priority.
    "scrapy_dangdang_060.pipelines.ScrapyDangdang060Pipeline": 300,
}

items.py

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class ScrapyDangdang060Item(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()

    # In plain terms: these are the pieces of data we want to collect.
    # image URL
    src = scrapy.Field()
    # product name
    name = scrapy.Field()
    # price
    price = scrapy.Field()
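For reference (a small sketch, not one of the project files): the itemadapter package that the generated pipelines.py already imports gives a uniform way to read these fields, whether the item arrives as a scrapy.Item or a plain dict.

from itemadapter import ItemAdapter

def describe(item):
    adapter = ItemAdapter(item)  # wraps scrapy.Item, dict, dataclass items, ...
    return f"{adapter.get('name')} @ {adapter.get('price')}"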

settings.py

# Scrapy settings for scrapy_dangdang_060 project

#

# For simplicity, this file contains only settings considered important or

# commonly used. You can find more settings consulting the documentation:

#

# https://docs.scrapy.org/en/latest/topics/settings.html

# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html

# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "scrapy_dangdang_060"

SPIDER_MODULES = ["scrapy_dangdang_060.spiders"]

NEWSPIDER_MODULE = "scrapy_dangdang_060.spiders"

# Crawl responsibly by identifying yourself (and your website) on the user-agent

#USER_AGENT = "scrapy_dangdang_060 (+http://www.yourdomain.com)"

# Obey robots.txt rules

ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)

#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)

# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay

# See also autothrottle settings and docs

#DOWNLOAD_DELAY = 3

# The download delay setting will honor only one of:

#CONCURRENT_REQUESTS_PER_DOMAIN = 16

#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)

#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)

#TELNETCONSOLE_ENABLED = False

# Override the default request headers:

#DEFAULT_REQUEST_HEADERS = {

# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",

# "Accept-Language": "en",

#}

# Enable or disable spider middlewares

# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html

#SPIDER_MIDDLEWARES = {

# "scrapy_dangdang_060.middlewares.ScrapyDangdang060SpiderMiddleware": 543,

#}

# Enable or disable downloader middlewares

# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html

#DOWNLOADER_MIDDLEWARES = {

# "scrapy_dangdang_060.middlewares.ScrapyDangdang060DownloaderMiddleware": 543,

#}

# Enable or disable extensions

# See https://docs.scrapy.org/en/latest/topics/extensions.html

#EXTENSIONS = {

# "scrapy.extensions.telnet.TelnetConsole": None,

#}

# Configure item pipelines

# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# There can be many pipelines; each has a priority in the range 1-1000,
# and the smaller the number, the higher the priority.

ITEM_PIPELINES = {
    "scrapy_dangdang_060.pipelines.ScrapyDangdang060Pipeline": 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)

# See https://docs.scrapy.org/en/latest/topics/autothrottle.html

#AUTOTHROTTLE_ENABLED = True

# The initial download delay

#AUTOTHROTTLE_START_DELAY = 5

# The maximum download delay to be set in case of high latencies

#AUTOTHROTTLE_MAX_DELAY = 60

# The average number of requests Scrapy should be sending in parallel to

# each remote server

#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0

# Enable showing throttling stats for every response received:

#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)

# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings

#HTTPCACHE_ENABLED = True

#HTTPCACHE_EXPIRATION_SECS = 0

#HTTPCACHE_DIR = "httpcache"

#HTTPCACHE_IGNORE_HTTP_CODES = []

#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# Set settings whose default value is deprecated to a future-proof value

REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"

TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"

FEED_EXPORT_ENCODING = "utf-8"
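Most of this file is the stock project template. If the crawl needs to be throttled, the commented-out settings above are the place to do it; for example (optional, the values here are only illustrative):

DOWNLOAD_DELAY = 1                  # wait 1 second between requests to the same site
CONCURRENT_REQUESTS_PER_DOMAIN = 8  # lower the per-domain concurrency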

dang.py

import scrapy
# An IDE may flag this import, but the code runs fine
from scrapy_dangdang_060.items import ScrapyDangdang060Item


class DangSpider(scrapy.Spider):
    name = "dang"
    allowed_domains = ["category.dangdang.com"]
    start_urls = ["https://category.dangdang.com/cid4002429.html"]

    def parse(self, response):
        print("===============success================")
        # pipelines: download/store the data
        # items: define the data structure
        # src   = //ul[@id="component_47"]/li//img/@src
        # alt   = //ul[@id="component_47"]/li//img/@alt
        # price = //ul[@id="component_47"]/li//p/span/text()

        # Every Selector object can call .xpath() again
        li_list = response.xpath('//ul[@id="component_47"]/li')
        for li in li_list:
            # The page lazy-loads images, so @src alone is not reliable.
            src = li.xpath('.//a//img/@data-original').extract_first()
            # The first few images are marked up differently from the rest:
            # the first image's address is in @src, the others use @data-original.
            if not src:
                src = li.xpath('.//a//img/@src').extract_first()

            name = li.xpath('.//img/@alt').extract_first()
            price = li.xpath('.//p[@class="price"]/span[1]/text()').extract_first()
            # print(src, name, price)

            book = ScrapyDangdang060Item(src=src, name=name, price=price)
            # Hand each book to the pipelines as soon as it is built
            yield book
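To run the spider, use Scrapy's standard command from inside the project directory:

scrapy crawl dang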

After it finishes, book.json holds all of the data from this Dangdang page.
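One caveat: str(item) writes a Python repr, not valid JSON. If genuine JSON output is wanted, one alternative (a sketch, not what the project above does) is to serialize each item with json.dumps, one object per line:

import json
from itemadapter import ItemAdapter

class JsonLinesPipeline:
    def open_spider(self, spider):
        self.fp = open('book.jsonl', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # asdict() turns the scrapy.Item into a plain dict for json.dumps
        self.fp.write(json.dumps(ItemAdapter(item).asdict(), ensure_ascii=False) + '\n')
        return item

    def close_spider(self, spider):
        self.fp.close()

Scrapy's built-in feed exports (scrapy crawl dang -o book.json) achieve much the same without writing a pipeline at all.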

Downloading images with multiple pipelines

Two steps are needed: (1) define a second pipeline class, and (2) enable it in settings with "scrapy_dangdang_060.pipelines.DangDangDownloadPipeline": 301.

pipelines.py

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter


# To use a pipeline, it must be enabled in settings.py
class ScrapyDangdang060Pipeline:

    def open_spider(self, spider):
        """Runs once, before the spider starts crawling."""
        print("++++++++++=========")
        self.fp = open('book.json', 'w', encoding='utf-8')

    # item is the book object yielded by the spider:
    # book = ScrapyDangdang060Item(src=src, name=name, price=price)
    def process_item(self, item, spider):
        # Not recommended: opening the file once per item means far too
        # many file operations.
        # (1) write() only accepts strings, not arbitrary objects.
        # (2) Mode 'w' would reopen and truncate the file for every item,
        #     so each write would overwrite the previous one; 'a' appends.
        # with open('book.json', 'a', encoding='utf-8') as fp:
        #     fp.write(str(item))

        # Keeping a single file handle open (see open_spider) avoids
        # reopening the file over and over.
        self.fp.write(str(item))
        return item

    def close_spider(self, spider):
        """Runs once, after the spider has finished."""
        print("------------------==========")
        self.fp.close()


import urllib.request

# Enabling a second, parallel pipeline:
# (1) define the pipeline class
# (2) enable it in settings:
#     "scrapy_dangdang_060.pipelines.DangDangDownloadPipeline": 301,
class DangDangDownloadPipeline:

    def process_item(self, item, spider):
        url = 'https:' + item.get('src')
        # the ./books directory must already exist
        filename = './books/' + item.get('name') + '.jpg'
        urllib.request.urlretrieve(url=url, filename=filename)
        return item

settings.py

ITEM_PIPELINES = {
    "scrapy_dangdang_060.pipelines.ScrapyDangdang060Pipeline": 300,
    # DangDangDownloadPipeline
    "scrapy_dangdang_060.pipelines.DangDangDownloadPipeline": 301,
}

Only these two files change; nothing else needs to be touched.
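As a side note, urllib.request.urlretrieve is a blocking call, so it stalls Scrapy's event loop while each image downloads. Scrapy also ships an ImagesPipeline that downloads media through its normal (asynchronous) downloader. A rough sketch of that approach — not what this project does, and the class name DangDangImagesPipeline is made up here — would look like:

import scrapy
from scrapy.pipelines.images import ImagesPipeline  # requires Pillow

class DangDangImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # let Scrapy's downloader fetch the image instead of urllib
        yield scrapy.Request('https:' + item['src'])

    def file_path(self, request, response=None, info=None, *, item=None):
        return item['name'] + '.jpg'

It would be enabled like any other pipeline, plus an IMAGES_STORE = './books' setting telling Scrapy where to save the files.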

Multi-page crawling

dang.py

Comparing the URLs of different pages reveals a simple pattern, so the same parse method can be reused to crawl every page.

import scrapy
# An IDE may flag this import, but the code runs fine
from scrapy_dangdang_060.items import ScrapyDangdang060Item


class DangSpider(scrapy.Spider):
    name = "dang"
    # For multi-page crawling, allowed_domains should normally contain just the
    # domain, so that follow-up requests are not filtered out.
    allowed_domains = ["category.dangdang.com"]
    start_urls = ["https://category.dangdang.com/cid4002429.html"]

    base_url = 'https://category.dangdang.com/pg'
    page = 1

    def parse(self, response):
        print("===============success================")
        # pipelines: download/store the data
        # items: define the data structure
        # src   = //ul[@id="component_47"]/li//img/@src
        # alt   = //ul[@id="component_47"]/li//img/@alt
        # price = //ul[@id="component_47"]/li//p/span/text()

        # Every Selector object can call .xpath() again
        li_list = response.xpath('//ul[@id="component_47"]/li')
        for li in li_list:
            # The page lazy-loads images, so @src alone is not reliable.
            src = li.xpath('.//a//img/@data-original').extract_first()
            # The first image's address is in @src, the others use @data-original.
            if not src:
                src = li.xpath('.//a//img/@src').extract_first()

            name = li.xpath('.//img/@alt').extract_first()
            price = li.xpath('.//p[@class="price"]/span[1]/text()').extract_first()
            # print(src, name, price)

            book = ScrapyDangdang060Item(src=src, name=name, price=price)
            # Hand each book to the pipelines as soon as it is built
            yield book

        # Every page is parsed the same way, so we only need to request the
        # next page and point the request back at parse.
        # page 1: https://category.dangdang.com/cid4002429.html
        # page 2: https://category.dangdang.com/pg2-cid4002429.html
        # page 3: https://category.dangdang.com/pg3-cid4002429.html
        if self.page < 100:
            self.page = self.page + 1
            url = self.base_url + str(self.page) + '-cid4002429.html'
            # This is Scrapy's GET request.
            # Note: pass self.parse without parentheses (a callback, not a call).
            yield scrapy.Request(url=url, callback=self.parse)
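Hard-coding 100 pages works for this category, but a more general pattern is to follow the "next page" link that the site itself renders. A sketch (the XPath for the next-page link is an assumption and would need to be checked against the real page):

# inside parse(), after the for-loop over li_list
next_href = response.xpath('//li[@class="next"]/a/@href').extract_first()
if next_href:
    # response.follow resolves relative URLs against the current page
    yield response.follow(next_href, callback=self.parse)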

pipelines.py

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter


# To use a pipeline, it must be enabled in settings.py
class ScrapyDangdang060Pipeline:

    def open_spider(self, spider):
        """Runs once, before the spider starts crawling."""
        print("++++++++++=========")
        self.fp = open('book.json', 'w', encoding='utf-8')

    # item is the book object yielded by the spider:
    # book = ScrapyDangdang060Item(src=src, name=name, price=price)
    def process_item(self, item, spider):
        # Not recommended: opening the file once per item means far too
        # many file operations.
        # (1) write() only accepts strings, not arbitrary objects.
        # (2) Mode 'w' would reopen and truncate the file for every item,
        #     so each write would overwrite the previous one; 'a' appends.
        # with open('book.json', 'a', encoding='utf-8') as fp:
        #     fp.write(str(item))

        # Keeping a single file handle open (see open_spider) avoids
        # reopening the file over and over.
        self.fp.write(str(item))
        return item

    def close_spider(self, spider):
        """Runs once, after the spider has finished."""
        print("------------------==========")
        self.fp.close()


import urllib.request

# Enabling a second, parallel pipeline:
# (1) define the pipeline class
# (2) enable it in settings:
#     "scrapy_dangdang_060.pipelines.DangDangDownloadPipeline": 301,
class DangDangDownloadPipeline:

    def process_item(self, item, spider):
        url = 'https:' + item.get('src')
        # the ./books directory must already exist
        filename = './books/' + item.get('name') + '.jpg'
        urllib.request.urlretrieve(url=url, filename=filename)
        return item

settings.py

# Scrapy settings for scrapy_dangdang_060 project

#

# For simplicity, this file contains only settings considered important or

# commonly used. You can find more settings consulting the documentation:

#

# https://docs.scrapy.org/en/latest/topics/settings.html

# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html

# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "scrapy_dangdang_060"

SPIDER_MODULES = ["scrapy_dangdang_060.spiders"]

NEWSPIDER_MODULE = "scrapy_dangdang_060.spiders"

# Crawl responsibly by identifying yourself (and your website) on the user-agent

#USER_AGENT = "scrapy_dangdang_060 (+http://www.yourdomain.com)"

# Obey robots.txt rules

ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)

#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)

# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay

# See also autothrottle settings and docs

#DOWNLOAD_DELAY = 3

# The download delay setting will honor only one of:

#CONCURRENT_REQUESTS_PER_DOMAIN = 16

#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)

#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)

#TELNETCONSOLE_ENABLED = False

# Override the default request headers:

#DEFAULT_REQUEST_HEADERS = {

# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",

# "Accept-Language": "en",

#}

# Enable or disable spider middlewares

# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html

#SPIDER_MIDDLEWARES = {

# "scrapy_dangdang_060.middlewares.ScrapyDangdang060SpiderMiddleware": 543,

#}

# Enable or disable downloader middlewares

# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html

#DOWNLOADER_MIDDLEWARES = {

# "scrapy_dangdang_060.middlewares.ScrapyDangdang060DownloaderMiddleware": 543,

#}

# Enable or disable extensions

# See https://docs.scrapy.org/en/latest/topics/extensions.html

#EXTENSIONS = {

# "scrapy.extensions.telnet.TelnetConsole": None,

#}

# Configure item pipelines

# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# There can be many pipelines; each has a priority in the range 1-1000,
# and the smaller the number, the higher the priority.

ITEM_PIPELINES = {
    "scrapy_dangdang_060.pipelines.ScrapyDangdang060Pipeline": 300,
    # DangDangDownloadPipeline
    "scrapy_dangdang_060.pipelines.DangDangDownloadPipeline": 301,
}

# Enable and configure the AutoThrottle extension (disabled by default)

# See https://docs.scrapy.org/en/latest/topics/autothrottle.html

#AUTOTHROTTLE_ENABLED = True

# The initial download delay

#AUTOTHROTTLE_START_DELAY = 5

# The maximum download delay to be set in case of high latencies

#AUTOTHROTTLE_MAX_DELAY = 60

# The average number of requests Scrapy should be sending in parallel to

# each remote server

#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0

# Enable showing throttling stats for every response received:

#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)

# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings

#HTTPCACHE_ENABLED = True

#HTTPCACHE_EXPIRATION_SECS = 0

#HTTPCACHE_DIR = "httpcache"

#HTTPCACHE_IGNORE_HTTP_CODES = []

#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# Set settings whose default value is deprecated to a future-proof value

REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"

TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"

FEED_EXPORT_ENCODING = "utf-8"

items.py

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class ScrapyDangdang060Item(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()

    # In plain terms: these are the pieces of data we want to collect.
    # image URL
    src = scrapy.Field()
    # product name
    name = scrapy.Field()
    # price
    price = scrapy.Field()

Summary

Hard? Yes. But a man can't admit it's hard ┭┮﹏┭┮

ヾ( ̄▽ ̄)Bye~Bye~
