python - Proper way to run multiple scrapy spiders -
I tried running multiple spiders in the same process using the new Scrapy documentation, but I am getting: AttributeError: 'CrawlerProcess' object has no attribute 'crawl'
I found this post about the same problem and tried using the code from the 0.24 documentation, but got: runspider: error: Unable to load 'price_comparator.py': No module named testspiders.spiders.followall
For 1.0 I imported:
from scrapy.crawler import CrawlerProcess
and for 0.24 I imported:
from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy import log
from testspiders.spiders.followall import FollowAllSpider
from scrapy.utils.project import get_project_settings
Based on the 0.24 docs, it seems the code runs one spider through multiple domains, which doesn't seem to be the same as what the 1.0 doc's code does, but I ran both anyway. I have the code to run both spiders inside the file where the spiders are defined, which may be the problem. Is there an internal issue with the new version of the code, or is there a dependency or piece of code missing from my program? I have the file with the code from both docs below (I didn't run both versions at the same time).
Spider class definitions
Relevant code at the bottom
import re import json import scrapy scrapy import request scrapy.contrib.spiders import crawlspider , rule scrapy.selector import htmlxpathselector scrapy.selector import selector scrapy.contrib.loader import itemloader scrapy.contrib.loader import xpathitemloader scrapy.contrib.loader.processor import join, mapcompose concert_comparator.items import comparatoritem, comparatoritem2 twisted.internet import reactor scrapy.crawler import crawlerprocess #from scrapy.crawler import crawler scrapy import log #from testspiders.spiders.followall import followallspider scrapy.utils.project import get_project_settings urlparse import urljoin bandname = raw_input("enter bandname \n") #location = raw_input("enter city \n") vs_url = "http://www.vividseats.com/concerts/" + bandname + "-tickets.html" sg_url = "http://www.seatgeek.com/" + bandname + "-tickets" #sh_url = "http://www.stubhub.com/" + bandname + "-tickets/" #print sh_url #rules = (rule(linkextractor(allow=("concerts/" + bandname + "-tickets/" + bandname + "-" + item["ticketslink"]),restrict_xpaths=('.//*/td[3]/a/@href',))callback = "parse_tickets" , follow = true,)) class myspider(crawlspider): handle_httpstatus_list = [416] name = 'comparator' allowed_domains = ["www.vividseats.com"] start_urls = [vs_url] tickets_list_xpath = './/*[@itemtype="http://schema.org/event"]' def parse_json(self, response): loader = response.meta['loader'] jsonresponse = json.loads(response.body_as_unicode()) ticket_info = jsonresponse.get('tickets') price_list = [i.get('p') in ticket_info] ticketprice = ''.join(price_list[0]) loader.add_value('ticketprice', ticketprice) return loader.load_item() def parse_price(self, response): loader = response.meta['loader'] ticketslink = loader.get_output_value("ticketslink") json_id_list= re.findall(r"(\d+)[^-]*$", ticketslink) json_id= "".join(json_id_list) json_url = "http://www.vividseats.com/javascript/tickets.shtml?productionid=" + json_id yield scrapy.request(json_url, meta={'loader': loader}, 
callback = self.parse_json, dont_filter = true) def parse(self, response): """ # """ selector = htmlxpathselector(response) # iterate on tickets ticket in selector.select(self.tickets_list_xpath): loader = xpathitemloader(comparatoritem(), selector=ticket) # define loader loader.default_input_processor = mapcompose(unicode.strip) loader.default_output_processor = join() # iterate on fields , add xpaths loader loader.add_xpath('eventname' , './/*[@class="productionsevent"]/text()') loader.add_xpath('eventlocation' , './/*[@class = "productionsvenue"]/span[@itemprop = "name"]/text()') loader.add_xpath('ticketslink' , './/*/a[@class = "btn btn-primary"]/@href') loader.add_xpath('eventdate' , './/*[@class = "productionsdate"]/text()') loader.add_xpath('eventcity' , './/*[@class = "productionsvenue"]/span[@itemprop = "address"]/span[@itemprop = "addresslocality"]/text()') loader.add_xpath('eventstate' , './/*[@class = "productionsvenue"]/span[@itemprop = "address"]/span[@itemprop = "addressregion"]/text()') loader.add_xpath('eventtime' , './/*[@class = "productionstime"]/text()') print "here ticket link \n" + loader.get_output_value("ticketslink") #sel.xpath("//span[@id='practitionerdetails1_label4']/text()").extract() ticketsurl = "concerts/" + bandname + "-tickets/" + bandname + "-" + loader.get_output_value("ticketslink") ticketsurl = urljoin(response.url, ticketsurl) yield scrapy.request(ticketsurl, meta={'loader': loader}, callback = self.parse_price, dont_filter = true) class myspider2(crawlspider): handle_httpstatus_list = [416] name = 'comparator2' allowed_domains = ["www.seatgeek.com/"] start_urls = [sg_url] tickets_list_xpath = './/*[@itemtype="http://schema.org/event"]' def parse_json2(self, response): loader = response.meta['loader'] jsonresponse = json.loads(response.body_as_unicode()) listings_info = jsonresponse.get('listings') price_list = [i.get('pf') in ticket_info] ticketprice = price_list[0] loader.add_value('ticketprice', ticketprice) return 
loader.load_item() def parse_price2(self, response): loader = response.meta['loader'] ticketslink = loader.get_output_value("ticketslink") json_id= ticketslink.split('/')[6] json_url = "https://seatgeek.com/listings?client_id=mty2mnwxmzgzmziwmtu4&id=" + json_id + "&_wt=1&&_=1436364489501" yield scrapy.request(json_url, meta={'loader': loader}, callback = self.parse_json, dont_filter = true) def parse2(self, response): """ # """ selector = htmlxpathselector(response) # iterate on tickets ticket in selector.select(self.tickets_list_xpath): loader = xpathitemloader(comparatoritem(), selector=ticket) # define loader loader.default_input_processor = mapcompose(unicode.strip) loader.default_output_processor = join() # iterate on fields , add xpaths loader loader.add_xpath('eventname' , './/a[@class = "event-listing-title"]/span[@itemprop = "name"]/text()') loader.add_xpath('eventlocation' , './/a[@class = "event-listing-venue-link"]/span[@itemprop = "name"]/text()') loader.add_xpath('ticketslink' , '//a[@class = "event-listing-button"]/@href') loader.add_xpath('eventdate' , '//div[@class = "event-listing-date"]/text()') loader.add_xpath('eventcity' , './/span[@itemprop = "addresslocality"]/text()') loader.add_xpath('eventstate' , './/span[@itemprop = "addressregion"]/text()') loader.add_xpath('eventcountry' , './/span[@itemprop = "addresscountry"]/text()') loader.add_xpath('eventtime' , '//div[@class = "event-listing-time"]/text()') #ticketsurl = "concerts/" + bandname + "-tickets/" + bandname + "-" + loader.get_output_value("ticketslink") tickets_url = "www.seatgeek.com/" + loader.get_output_value("ticketslink") #ticketsurl = urljoin(response.url, ticketsurl) yield scrapy.request(tickets_url, meta={'loader': loader}, callback = self.parse_price2, dont_filter = true) #0.24 code # def setup_crawler(domain): # spider = followallspider(domain=domain) # settings = get_project_settings() # crawler = crawler(settings) # crawler.configure() # crawler.crawl(spider) # 
crawler.start() # domain in [vs_url, sg_url]: # setup_crawler(domain) # log.start() # reactor.run() #1.0 code process = crawlerprocess(get_project_settings()) process = crawlerprocess({ 'user_agent' : 'mozilla/4.0 (compartible; msie 7.0; windows nt 5.1)' }) process.crawl(myspider) process.crawl(myspider2) process.start()
Comments
Post a Comment