创建scrapy项目
scrapy startproject 项目名称
scrapy startproject lottery
进入项目终端
cd 项目名称
cd lottery
创建爬虫
scrapy genspider 名字 域名
scrapy genspider pl3 m.55123.cn
可能需要修改start_urls成你要爬取的页面
对数据进行解析
def parse(self,response):
response.text 拿页面源代码（注意：text 是属性，不是方法，不加括号）
response.xpath()
response.css()
解析数据 默认xpath()返回的selector对象
想要数据必须使用extract()提取
extract_first() 返回一个数据
yield 返回数据 把数据交给pipeline进行持久化存储
pl3.py
import scrapy
import warnings
warnings.filterwarnings("ignore") #去除不影响程序运行的警告
from lottery.items import PaiLieSan
class Pl3Spider(scrapy.Spider):
    """Scrape Pailie-3 lottery draw history (issue number + winning digits)
    from the chart page on 55123.cn and yield one PaiLieSan item per draw."""
    name = "pl3"
    allowed_domains = ["m.55123.cn"]
    start_urls = ["https://www.55123.cn/zs/p3_14.html?record=800"]

    def parse(self, resp):
        # Each <tr> inside tbody#chartData is one lottery draw.
        for tr in resp.xpath('//tbody[@id="chartData"]/tr'):
            # 期号 (issue number) is a single cell: use extract_first() to get
            # a plain string. The original extract() stored a one-element
            # list, which breaks the parameterized SQL insert downstream.
            pls_qh = tr.xpath('./td[1]/text()').extract_first()
            # 中奖号码 (winning digits) are several highlighted cells; keep
            # the list — the pipeline joins them with "_".
            pls_num = tr.xpath('./td[@class="bg-yellow red"]/text()').extract()
            if not pls_qh:
                # Skip header/filler rows that carry no issue number.
                continue
            pl3 = PaiLieSan()
            pl3['pl3_qh'] = pls_qh
            pl3['pl3_num'] = pls_num
            yield pl3
items.py
import scrapy
class PaiLieSan(scrapy.Item):
    """Container for one Pailie-3 draw scraped by the pl3 spider."""

    pl3_qh = scrapy.Field()   # issue number (期号)
    pl3_num = scrapy.Field()  # winning digits (中奖号码)
在pipeline完成数据的存储
class 类名():
def process_item(self,item,spider):
item:数据
spider:爬虫
return item
import pymysql
from lottery.settings import MYSQL
#排列三
class PL3MySQLPipeline:
    """Persist PaiLieSan items into the MySQL table PL3.

    Opens one connection per spider run (open_spider) and closes it when
    the spider finishes (close_spider).
    """

    def open_spider(self, spider):
        # Connection parameters come from the MYSQL dict in settings.py.
        self.conn = pymysql.connect(
            host=MYSQL['host'],
            port=MYSQL['port'],
            user=MYSQL['user'],
            password=MYSQL['password'],
            database=MYSQL['database']
        )

    def process_item(self, item, spider):
        sql = "insert into PL3 (期号,中奖号码) values (%s,%s)"
        try:
            # Cursor as a context manager guarantees it is closed even on
            # error. (The original could raise NameError in `finally` when
            # self.conn.cursor() itself failed before `cursor` was bound.)
            with self.conn.cursor() as cursor:
                cursor.execute(sql, (item['pl3_qh'], "_".join(item['pl3_num'])))
            self.conn.commit()
        except pymysql.MySQLError:
            # Catch only database errors (a bare `except:` would also hide
            # KeyError/TypeError bugs); keep the original best-effort
            # behavior: roll back, log, and continue with the next item.
            self.conn.rollback()
            spider.logger.exception("failed to insert item: %r", item)
        # Return the item so any later pipelines still receive it
        # (the original returned None).
        return item

    def close_spider(self, spider):
        if self.conn:
            self.conn.close()
开启管道
设置settings.py文件将pipeline进行生效设置
ITEM_PIPELINES= {
'管道的路径':优先级
}
# Enable the MySQL pipeline; 300 is its priority (lower numbers run first).
ITEM_PIPELINES = {
    "lottery.pipelines.PL3MySQLPipeline": 300,
}
配置MySQL
# MySQL connection settings consumed by PL3MySQLPipeline.open_spider
MYSQL = {
    "host": "localhost",
    "port": 3306,
    "user": "root",
    "password": "root",
    "database": 'test'
}
Robot协议
ROBOTSTXT_OBEY = False
终端运行爬虫
scrapy crawl 爬虫的名字
PS C:\Users\HP\PycharmProjects\pythonProject\爬虫\scrapy入门> cd lottery
PS C:\Users\HP\PycharmProjects\pythonProject\爬虫\scrapy入门\lottery> scrapy crawl pl3
数据库查询结果
精彩文章
发表评论