from time import time
from urlparse import urlparse, urlunparse, urldefrag
from twisted.python import failure
from twisted.web.client import HTTPClientFactory
from twisted.web.http import HTTPClient
from twisted.internet import defer
from scrapy.http import Headers
from scrapy.utils.httpobj import urlparse_cached
from scrapy.responsetypes import responsetypes
from scrapy import optional_features


def _parsed_url_args(parsed):
path = urlunparse(('', '', parsed.path or '/', parsed.params, parsed.query, ''))
host = parsed.hostname
port = parsed.port
scheme = parsed.scheme
netloc = parsed.netloc
if port is None:
port = 443 if scheme == 'https' else 80
    return scheme, netloc, host, port, path


def _parse(url):
url = url.strip()
parsed = urlparse(url)
return _parsed_url_args(parsed)
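
# Illustrative example only (hypothetical URL; the default port is
# inferred from the scheme):
#   _parse('http://example.com/index?q=1')
#   => ('http', 'example.com', 'example.com', 80, '/index?q=1')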


class ScrapyHTTPPageGetter(HTTPClient):

    delimiter = '\n'  # accept bare LF endings; lineReceived strips any trailing CR
def connectionMade(self):
self.headers = Headers() # bucket for response headers
# Method command
self.sendCommand(self.factory.method, self.factory.path)
# Headers
for key, values in self.factory.headers.items():
for value in values:
self.sendHeader(key, value)
self.endHeaders()
# Body
if self.factory.body is not None:
self.transport.write(self.factory.body)
def lineReceived(self, line):
return HTTPClient.lineReceived(self, line.rstrip())
def handleHeader(self, key, value):
self.headers.appendlist(key, value)
def handleStatus(self, version, status, message):
self.factory.gotStatus(version, status, message)
def handleEndHeaders(self):
self.factory.gotHeaders(self.headers)
def connectionLost(self, reason):
HTTPClient.connectionLost(self, reason)
self.factory.noPage(reason)
    def handleResponse(self, response):
        if self.factory.method.upper() == 'HEAD':
            # HEAD responses carry no body; deliver an empty page
            self.factory.page('')
        else:
            self.factory.page(response)
        self.transport.loseConnection()
    def timeout(self):
        self.transport.loseConnection()
        self.factory.noPage(
            defer.TimeoutError("Getting %s took longer than %s seconds." %
                (self.factory.url, self.factory.timeout)))


class ScrapyHTTPClientFactory(HTTPClientFactory):
    """Scrapy implementation of the HTTPClientFactory overriding the
    setUrl method to make use of our Url object that caches the parse
    result.
    """
protocol = ScrapyHTTPPageGetter
waiting = 1
noisy = False
followRedirect = False
afterFoundGet = False
def __init__(self, request, timeout=180):
self.url = urldefrag(request.url)[0]
self.method = request.method
self.body = request.body or None
self.headers = Headers(request.headers)
self.response_headers = None
self.timeout = request.meta.get('download_timeout') or timeout
self.start_time = time()
self.deferred = defer.Deferred().addCallback(self._build_response, request)
# Fixes Twisted 11.1.0+ support as HTTPClientFactory is expected
# to have _disconnectedDeferred. See Twisted r32329.
        # As Scrapy implements its own redirect handling logic, there is
        # no need to add the _waitForDisconnect callback.
        # Specifically, this avoids the AttributeError exception raised
        # when the clientConnectionFailed method is called.
self._disconnectedDeferred = defer.Deferred()
self._set_connection_attributes(request)
# set Host header based on url
self.headers.setdefault('Host', self.netloc)
        # set Content-Length based on the length of the body
if self.body is not None:
self.headers['Content-Length'] = len(self.body)
        # just in case a broken http/1.1 server decides to keep the connection alive
self.headers.setdefault("Connection", "close")
    def _build_response(self, body, request):
        request.meta['download_latency'] = self.headers_time - self.start_time
status = int(self.status)
headers = Headers(self.response_headers)
        # pick the most specific Response subclass (e.g. HtmlResponse)
        # based on the response headers and the url
        respcls = responsetypes.from_args(headers=headers, url=self.url)
return respcls(url=self.url, status=status, headers=headers, body=body)
def _set_connection_attributes(self, request):
parsed = urlparse_cached(request)
self.scheme, self.netloc, self.host, self.port, self.path = _parsed_url_args(parsed)
        proxy = request.meta.get('proxy')
        if proxy:
            # when downloading through a proxy, connect to the proxy
            # host/port and request the absolute URL as the path
            self.scheme, _, self.host, self.port, _ = _parse(proxy)
            self.path = self.url
def gotHeaders(self, headers):
self.headers_time = time()
self.response_headers = headers
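
# Usage sketch (illustrative, not part of this module): a downloader
# typically wires the factory to the reactor roughly as follows, where
# `request` is assumed to be a scrapy.http.Request and `handle_response`
# a hypothetical callback:
#
#   from twisted.internet import reactor
#   factory = ScrapyHTTPClientFactory(request)
#   reactor.connectTCP(factory.host, factory.port, factory,
#                      timeout=factory.timeout)
#   factory.deferred.addCallback(handle_response)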


if 'ssl' in optional_features:
from twisted.internet.ssl import ClientContextFactory
from OpenSSL import SSL
else:
    # SSL support is unavailable; fall back to a plain base class so the
    # class definition below still works
    ClientContextFactory = object


class ScrapyClientContextFactory(ClientContextFactory):
    "An SSL context factory which is more permissive against SSL bugs."
# see https://github.com/scrapy/scrapy/issues/82
# and https://github.com/scrapy/scrapy/issues/26
def getContext(self):
ctx = ClientContextFactory.getContext(self)
# Enable all workarounds to SSL bugs as documented by
# http://www.openssl.org/docs/ssl/SSL_CTX_set_options.html
ctx.set_options(SSL.OP_ALL)
return ctx
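
# For HTTPS the same client factory is combined with this permissive
# context factory, roughly (sketch; assumes the 'ssl' optional feature
# is available and `factory` is a ScrapyHTTPClientFactory):
#
#   reactor.connectSSL(factory.host, factory.port, factory,
#                      ScrapyClientContextFactory(), timeout=factory.timeout)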