Edit 2
Second approach: for now I have given up on using multiple instances and configured the Scrapy settings not to use concurrent requests. It is slow but stable. If I configure Scrapy to run concurrently, I get segmentation faults. I have opened a bounty: who can help me make this work concurrently?
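For reference, "not to use concurrent requests" just means the usual Scrapy concurrency settings, roughly like this (the middleware path and priority are placeholders for my project):

    # settings.py: force Scrapy to issue one request at a time
    CONCURRENT_REQUESTS = 1
    CONCURRENT_REQUESTS_PER_DOMAIN = 1

    DOWNLOADER_MIDDLEWARES = {
        # placeholder path/priority for wherever WebkitDownloader lives
        'myproject.middlewares.WebkitDownloader': 543,
    }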
class WebkitDownloader(object):

    def __init__(self):
        os.environ["DISPLAY"] = ":99"
        self.proxyAddress = "a:b@" + PROXY_DEFAULT_HOST + ":" + str(PROXY_DEFAULT_PORT)

    def process_response(self, request, response, spider):
        self.request = request
        self.response = response
        if 'cached' not in response.flags:
            webkitBrowser = webkit.WebkitBrowser(proxy=self.proxyAddress, gui=False, timeout=0.5, delay=0.5,
                                                 forbidden_extensions=['js', 'css', 'swf', 'pdf', 'doc', 'xls', 'ods', 'odt'])
            #print "added to queue: " + str(self.counter)
            webkitBrowser.get(html=response.body, num_retries=0)
            html = webkitBrowser.current_html()
            respcls = responsetypes.from_args(headers=response.headers, url=response.url)
            kwargs = dict(cls=respcls, body=killgremlins(html))
            response = response.replace(**kwargs)
            webkitBrowser.setPage(None)
            del webkitBrowser
        return response
Edit:
In the meantime I tried to answer my own question and implemented a queue, but it does not run asynchronously for some reason. Basically, while webkitBrowser.get(html=response.body, num_retries=0) is busy, Scrapy is blocked until the method finishes, and new requests are never assigned to the remaining free instances in self.queue (see the sketch after the code below).
Can anyone please point me in the right direction to make this work?
class WebkitDownloader(object):

    def __init__(self):
        proxyAddress = "http://" + PROXY_DEFAULT_HOST + ":" + str(PROXY_DEFAULT_PORT)
        self.queue = list()
        for i in range(8):
            self.queue.append(webkit.WebkitBrowser(proxy=proxyAddress, gui=True, timeout=0.5, delay=5.5,
                                                   forbidden_extensions=['js', 'css', 'swf', 'pdf', 'doc', 'xls', 'ods', 'odt']))

    def process_response(self, request, response, spider):
        # pick the first browser in the pool that is not busy
        for i, webkitBrowser in enumerate(self.queue):
            if webkitBrowser.status == "WAITING":
                break
        if webkitBrowser.status == "WAITING":
            # load webpage
            print "added to queue: " + str(i)
            webkitBrowser.get(html=response.body, num_retries=0)
            webkitBrowser.scrapyResponse = response
        while webkitBrowser.status == "PROCESSING":
            print "waiting for queue: " + str(i)
        if webkitBrowser.status == "DONE":
            print "fetched from queue: " + str(i)
            #response = webkitBrowser.scrapyResponse
            html = webkitBrowser.current_html()
            respcls = responsetypes.from_args(headers=response.headers, url=response.url)
            kwargs = dict(cls=respcls, body=killgremlins(html))
            #response = response.replace(**kwargs)
            webkitBrowser.status = "WAITING"
        return response
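My working assumption about why this blocks (not verified): process_response runs in Scrapy's single Twisted reactor thread, so the while webkitBrowser.status == "PROCESSING" busy-wait never gives control back to the reactor and no other request can make progress. The direction I am experimenting with is to push the blocking render into a worker thread with twisted.internet.threads.deferToThread and return the resulting Deferred from process_response. This is only a sketch: it assumes Scrapy's middleware chain waits on a returned Deferred, and that WebkitBrowser can be driven from a non-GUI thread at all (QtWebKit may not allow that, which could also explain the segmentation faults):

    from twisted.internet import defer
    from twisted.internet.threads import deferToThread

    class WebkitDownloader(object):

        def __init__(self):
            proxyAddress = "http://" + PROXY_DEFAULT_HOST + ":" + str(PROXY_DEFAULT_PORT)
            self.browser = webkit.WebkitBrowser(proxy=proxyAddress, gui=False, timeout=0.5, delay=0.5,
                                                forbidden_extensions=['js', 'css', 'swf', 'pdf', 'doc', 'xls', 'ods', 'odt'])
            # serialise access to the single browser; the reactor stays free meanwhile
            self.semaphore = defer.DeferredSemaphore(1)

        def _render(self, response):
            # runs in a worker thread, so the blocking get() call does not stall the reactor
            self.browser.get(html=response.body, num_retries=0)
            html = self.browser.current_html()
            respcls = responsetypes.from_args(headers=response.headers, url=response.url)
            return response.replace(cls=respcls, body=killgremlins(html))

        def process_response(self, request, response, spider):
            if 'cached' in response.flags:
                return response
            # returns a Deferred; the middleware chain should wait on it
            return self.semaphore.run(deferToThread, self._render, response)

With a single browser this still renders one page at a time, but parsing, scheduling and cached responses keep flowing while a page renders.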
I am using WebKit in a Scrapy middleware to render JavaScript. Currently, Scrapy is configured to process one request at a time (no concurrency).
I'd like to use concurrency (e.g. 8 requests at a time), but then I need to make sure that the 8 instances of WebkitBrowser() receive requests based on their individual processing state: each browser should get a fresh request as soon as its WebkitBrowser.get() call is done and it is ready for the next one.
How would I achieve that with Python? This is my current middleware (a pool sketch follows after it):
class WebkitDownloader(object):

    def __init__(self):
        proxyAddress = "http://" + PROXY_DEFAULT_HOST + ":" + str(PROXY_DEFAULT_PORT)
        self.w = webkit.WebkitBrowser(proxy=proxyAddress, gui=True, timeout=0.5, delay=0.5,
                                      forbidden_extensions=['js', 'css', 'swf', 'pdf', 'doc', 'xls', 'ods', 'odt'])

    def process_response(self, request, response, spider):
        if ".pdf" not in response.url:
            # load webpage
            self.w.get(html=response.body, num_retries=0)
            html = self.w.current_html()
            respcls = responsetypes.from_args(headers=response.headers, url=response.url)
            kwargs = dict(cls=respcls, body=killgremlins(html))
            response = response.replace(**kwargs)
        return response
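For the 8-instance dispatch described above, the sketch I have in mind is a thread-safe pool: check a browser out of a Queue, render in a worker thread via deferToThread, and put it back as soon as get() finishes, so an idle browser immediately receives the next request. Again, this assumes the WebkitBrowser instances can be used from worker threads and that Scrapy waits on the returned Deferred; neither is verified:

    import Queue  # 'queue' on Python 3
    from twisted.internet.threads import deferToThread

    class WebkitDownloader(object):

        def __init__(self):
            proxyAddress = "http://" + PROXY_DEFAULT_HOST + ":" + str(PROXY_DEFAULT_PORT)
            # pool of 8 browsers; Queue.Queue is thread-safe, so worker threads can share it
            self.pool = Queue.Queue()
            for _ in range(8):
                self.pool.put(webkit.WebkitBrowser(proxy=proxyAddress, gui=False, timeout=0.5, delay=0.5,
                                                   forbidden_extensions=['js', 'css', 'swf', 'pdf', 'doc', 'xls', 'ods', 'odt']))

        def _render(self, response):
            # runs in a worker thread: blocks until a browser is free, then renders
            browser = self.pool.get()
            try:
                browser.get(html=response.body, num_retries=0)
                html = browser.current_html()
            finally:
                self.pool.put(browser)  # hand the browser to the next waiting request
            respcls = responsetypes.from_args(headers=response.headers, url=response.url)
            return response.replace(cls=respcls, body=killgremlins(html))

        def process_response(self, request, response, spider):
            if ".pdf" in response.url:
                return response
            return deferToThread(self._render, response)

For eight renders to actually run in parallel, CONCURRENT_REQUESTS has to be at least 8 and the Twisted thread pool must be large enough (e.g. reactor.suggestThreadPoolSize(8) at startup).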