对伯乐在线所有文章进行爬取
使用scrapy框架
jobbolen.py
# -*- coding: utf-8 -*- import scrapy from scrapy.http import Request from urllib import parse from ScrapyText.items import Article_Item class JobbolenSpider(scrapy.Spider): name = 'jobbolen' allowed_domains = ['blog.jobbole.com'] start_urls = ['http://blog.jobbole.com/all-posts/'] def parse(self, response): re_nodes= response.css('#archive .floated-thumb .post-thumb a') for re_node in re_nodes: image_url=re_node.css("img::attr(src)").extract_first() re_url=re_node.css('::attr(href)').extract_first() yield Request(url=parse.urljoin(response.url,re_url),meta={'front_url_image':image_url},callback=self.text_parse)#yield交给scrapy进行自动下载 next_urls=response.css('.next.page-numbers::attr(href)').extract_first() if next_urls: yield Request(url=parse.urljoin(response.url, re_url), callback=self.parse) def text_parse(self,response): article_item=Article_Item() re_title = response.css('.entry-header h1::text').extract()[0] re_text = response.css('.entry p::text').extract() front_image=response.meta.get("front_url_image","") article_item["Title"]=re_title article_item["Text"]=re_text article_item["Front_image"]=front_image yield article_item items.py配置
# -*- coding: utf-8 -*- # Define here the models for your scraped items # # See documentation in: # https://doc.scrapy.org/en/latest/topics/items.html import scrapy class ScrapytextItem(scrapy.Item): # define the fields for your item here like: # name = scrapy.Field() pass class Article_Item(scrapy.Item): Title=scrapy.Field() Text=scrapy.Field() Front_image=scrapy.Field() Front_image_path=scrapy.Field()
setting.py配置 import os
ROBOTSTXT_OBEY = False
IMAGES_URLS_FIELD ="Front_image"#从item中找出那个是要保存的 project_dir=os.path.abspath(os.path.dirname(__file__)) IMAGES_STORE=os.path.join(project_dir,'images')#将图片保存在本地文件中 main.py
# -*- coding: utf-8 -*- __auther__="booby" from scrapy.cmdline import execute import sys import os sys.path.append(os.path.dirname(os.path.abspath(__file__))) execute(["scrapy","crawl","jobbolen"]) 运行出现错误:
解决方案:
由于将一个字符串传递给数组导致错误
将jobbolen.py中的front_image改成[front_image]
运行结果
提取出文章及标题和封面图片