python - Proper way to run multiple Scrapy spiders -


I tried running multiple spiders in the same process using the new Scrapy documentation, but I am getting: AttributeError: 'CrawlerProcess' object has no attribute 'crawl'

I found this post with the same problem, so I tried using the code from the 0.24 documentation and got: runspider: error: Unable to load 'price_comparator.py': No module named testspiders.spiders.followall

For 1.0 I imported:

from scrapy.crawler import CrawlerProcess

and for 0.24 I imported:

from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy import log
from testspiders.spiders.followall import FollowAllSpider
from scrapy.utils.project import get_project_settings

Based on the 0.24 doc, it seems that code runs one spider through multiple domains, which doesn't seem to be the same thing the 1.0 doc's code does, but I ran both anyway. I have the code to run both spiders inside the file where my spiders are defined, so that could be the problem. Is there some internal issue with the new version of the code, or is there some dependency or code I'm missing from my program? I have the file with the code from both docs below (I didn't run both versions at the same time).

spider class definitions

The relevant code is at the bottom.

"""Compare concert ticket listings for one band across two ticket sites.

Defines two spiders (vividseats.com and seatgeek.com) and runs both in a
single process with ``CrawlerProcess``.

NOTE(review): this listing was recovered from a copy in which identifier
casing had been lost. Scrapy/Twisted names have been restored to their
documented spelling. Project-local names (``comparatoritem``), item field
names, XPath class values and URL query strings are kept exactly as found --
confirm their real casing against the project before running.
"""
import re
import json

import scrapy
from scrapy import Request
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import HtmlXPathSelector
from scrapy.selector import Selector
from scrapy.contrib.loader import ItemLoader
from scrapy.contrib.loader import XPathItemLoader
from scrapy.contrib.loader.processor import Join, MapCompose
from concert_comparator.items import comparatoritem, comparatoritem2
from twisted.internet import reactor
from scrapy.crawler import CrawlerProcess
# from scrapy.crawler import Crawler
from scrapy import log
# from testspiders.spiders.followall import FollowAllSpider
from scrapy.utils.project import get_project_settings
from urlparse import urljoin

# The band name is read once at import time because both spiders build their
# start URLs from it at class-definition time.
bandname = raw_input("enter bandname \n")
# location = raw_input("enter city \n")
vs_url = "http://www.vividseats.com/concerts/" + bandname + "-tickets.html"
sg_url = "http://www.seatgeek.com/" + bandname + "-tickets"
# sh_url = "http://www.stubhub.com/" + bandname + "-tickets/"
# print sh_url
# rules = (rule(linkextractor(allow=("concerts/" + bandname + "-tickets/" + bandname + "-" + item["ticketslink"]),restrict_xpaths=('.//*/td[3]/a/@href',))callback = "parse_tickets" , follow = true,))


class myspider(CrawlSpider):
    """Scrape event listings for ``bandname`` from vividseats.com.

    Flow: ``parse`` (listing page) -> ``parse_price`` (event page) ->
    ``parse_json`` (ticket JSON), carrying the item loader in
    ``response.meta['loader']`` between steps.
    """

    handle_httpstatus_list = [416]
    name = 'comparator'
    allowed_domains = ["www.vividseats.com"]
    start_urls = [vs_url]
    tickets_list_xpath = './/*[@itemtype="http://schema.org/event"]'

    def parse_json(self, response):
        """Add the first ticket's price to the item and return the item.

        The item is returned without a price when the JSON payload has no
        tickets, instead of raising ``IndexError``.
        """
        loader = response.meta['loader']
        jsonresponse = json.loads(response.body_as_unicode())
        ticket_info = jsonresponse.get('tickets') or []
        price_list = [i.get('p') for i in ticket_info]
        if price_list:
            ticketprice = ''.join(price_list[0])
            loader.add_value('ticketprice', ticketprice)
        return loader.load_item()

    def parse_price(self, response):
        """Request the ticket JSON for the event whose page was just fetched."""
        loader = response.meta['loader']
        ticketslink = loader.get_output_value("ticketslink")
        # The production id is the last run of digits in the tickets link.
        json_id_list = re.findall(r"(\d+)[^-]*$", ticketslink)
        json_id = "".join(json_id_list)
        json_url = "http://www.vividseats.com/javascript/tickets.shtml?productionid=" + json_id
        yield scrapy.Request(json_url, meta={'loader': loader},
                             callback=self.parse_json, dont_filter=True)

    def parse(self, response):
        """Build one item loader per listed event and follow its tickets link.

        NOTE(review): Scrapy's docs advise against overriding ``parse`` on a
        ``CrawlSpider``; it is kept here because this spider defines no rules.
        """
        selector = HtmlXPathSelector(response)
        # iterate on tickets
        for ticket in selector.select(self.tickets_list_xpath):
            loader = XPathItemLoader(comparatoritem(), selector=ticket)
            # define loader
            loader.default_input_processor = MapCompose(unicode.strip)
            loader.default_output_processor = Join()
            # iterate on fields and add xpaths to the loader
            loader.add_xpath('eventname', './/*[@class="productionsevent"]/text()')
            loader.add_xpath('eventlocation', './/*[@class = "productionsvenue"]/span[@itemprop  = "name"]/text()')
            loader.add_xpath('ticketslink', './/*/a[@class = "btn btn-primary"]/@href')
            loader.add_xpath('eventdate', './/*[@class = "productionsdate"]/text()')
            loader.add_xpath('eventcity', './/*[@class = "productionsvenue"]/span[@itemprop  = "address"]/span[@itemprop  = "addresslocality"]/text()')
            loader.add_xpath('eventstate', './/*[@class = "productionsvenue"]/span[@itemprop  = "address"]/span[@itemprop  = "addressregion"]/text()')
            loader.add_xpath('eventtime', './/*[@class = "productionstime"]/text()')

            print("here ticket link \n" + loader.get_output_value("ticketslink"))
            # sel.xpath("//span[@id='practitionerdetails1_label4']/text()").extract()
            ticketsurl = "concerts/" + bandname + "-tickets/" + bandname + "-" + loader.get_output_value("ticketslink")
            ticketsurl = urljoin(response.url, ticketsurl)
            yield scrapy.Request(ticketsurl, meta={'loader': loader},
                                 callback=self.parse_price, dont_filter=True)


class myspider2(CrawlSpider):
    """Scrape event listings for ``bandname`` from seatgeek.com.

    Flow: ``parse2`` (listing page) -> ``parse_price2`` (event page) ->
    ``parse_json2`` (listings JSON), carrying the item loader in
    ``response.meta['loader']`` between steps.
    """

    handle_httpstatus_list = [416]
    name = 'comparator2'
    # Bare registered domain, no trailing slash, so that both
    # www.seatgeek.com pages and the seatgeek.com listings endpoint match.
    allowed_domains = ["seatgeek.com"]
    start_urls = [sg_url]
    tickets_list_xpath = './/*[@itemtype="http://schema.org/event"]'

    def start_requests(self):
        """Route the start URLs to ``parse2``.

        Scrapy sends start-URL responses to ``parse`` by default, so without
        this hook ``parse2`` would never be called.
        """
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse2, dont_filter=True)

    def parse_json2(self, response):
        """Add the first listing's price to the item and return the item.

        The item is returned without a price when the JSON payload has no
        listings, instead of raising ``IndexError``.
        """
        loader = response.meta['loader']
        jsonresponse = json.loads(response.body_as_unicode())
        listings_info = jsonresponse.get('listings') or []
        # Fixed: the original iterated the undefined name ``ticket_info``.
        price_list = [i.get('pf') for i in listings_info]
        if price_list:
            ticketprice = price_list[0]
            loader.add_value('ticketprice', ticketprice)
        return loader.load_item()

    def parse_price2(self, response):
        """Request the listings JSON for the event whose page was just fetched."""
        loader = response.meta['loader']
        ticketslink = loader.get_output_value("ticketslink")
        # NOTE(review): assumes the event id is the 7th '/'-separated piece
        # of the link -- confirm against real seatgeek hrefs.
        json_id = ticketslink.split('/')[6]
        json_url = "https://seatgeek.com/listings?client_id=mty2mnwxmzgzmziwmtu4&id=" + json_id + "&_wt=1&&_=1436364489501"
        # Fixed: this spider's JSON callback is parse_json2; it has no parse_json.
        yield scrapy.Request(json_url, meta={'loader': loader},
                             callback=self.parse_json2, dont_filter=True)

    def parse2(self, response):
        """Build one item loader per listed event and follow its tickets link."""
        selector = HtmlXPathSelector(response)
        # iterate on tickets
        for ticket in selector.select(self.tickets_list_xpath):
            # NOTE(review): ``comparatoritem2`` is imported but unused; check
            # whether this spider was meant to use it.
            loader = XPathItemLoader(comparatoritem(), selector=ticket)
            # define loader
            loader.default_input_processor = MapCompose(unicode.strip)
            loader.default_output_processor = Join()
            # iterate on fields and add xpaths to the loader
            loader.add_xpath('eventname', './/a[@class = "event-listing-title"]/span[@itemprop = "name"]/text()')
            loader.add_xpath('eventlocation', './/a[@class = "event-listing-venue-link"]/span[@itemprop = "name"]/text()')
            # Fixed: these three paths started with '//', which matches the
            # whole page rather than the current ticket node.
            loader.add_xpath('ticketslink', './/a[@class = "event-listing-button"]/@href')
            loader.add_xpath('eventdate', './/div[@class = "event-listing-date"]/text()')
            loader.add_xpath('eventcity', './/span[@itemprop  = "addresslocality"]/text()')
            loader.add_xpath('eventstate', './/span[@itemprop  = "addressregion"]/text()')
            loader.add_xpath('eventcountry', './/span[@itemprop  = "addresscountry"]/text()')
            loader.add_xpath('eventtime', './/div[@class = "event-listing-time"]/text()')

            # Fixed: "www.seatgeek.com/" + href has no scheme, which Scrapy
            # rejects. urljoin resolves relative and absolute hrefs alike.
            tickets_url = urljoin(response.url, loader.get_output_value("ticketslink"))
            yield scrapy.Request(tickets_url, meta={'loader': loader},
                                 callback=self.parse_price2, dont_filter=True)


# 0.24 code (kept for reference; not executed)
#
# def setup_crawler(domain):
#     spider = FollowAllSpider(domain=domain)
#     settings = get_project_settings()
#     crawler = Crawler(settings)
#     crawler.configure()
#     crawler.crawl(spider)
#     crawler.start()
#
# for domain in [vs_url, sg_url]:
#     setup_crawler(domain)
# log.start()
# reactor.run()

# 1.0 code
#
# NOTE(review): ``CrawlerProcess.crawl`` is the Scrapy 1.0 API. The reported
# "'CrawlerProcess' object has no attribute 'crawl'" suggests an older Scrapy
# is installed -- check with ``scrapy version``.
#
# Fixed: ``process`` was assigned twice, so the second CrawlerProcess threw
# away the project settings. The user agent is now layered on top of them.
settings = get_project_settings()
settings.set('USER_AGENT', 'mozilla/4.0 (compartible; msie 7.0; windows nt 5.1)')
process = CrawlerProcess(settings)
process.crawl(myspider)
process.crawl(myspider2)
process.start()  # blocks until both crawls have finished


Comments

Popular posts from this blog

python - No exponential form of the z-axis in matplotlib-3D-plots -

php - Best Light server (Linux + Web server + Database) for Raspberry Pi -

c# - "Newtonsoft.Json.JsonSerializationException unable to find constructor to use for types" error when deserializing class -