from time import time
from urlparse import urlparse, urlunparse, urldefrag

from twisted.python import failure
from twisted.web.client import HTTPClientFactory
from twisted.web.http import HTTPClient
from twisted.internet import defer

from scrapy.http import Headers
from scrapy.utils.httpobj import urlparse_cached
from scrapy.responsetypes import responsetypes
from scrapy import optional_features


def _parsed_url_args(parsed):
    # Return the connection arguments for an already-parsed URL, rebuilding
    # the request path and filling in the default port for the scheme.
    path = urlunparse(('', '', parsed.path or '/', parsed.params, parsed.query, ''))
    host = parsed.hostname
    port = parsed.port
    scheme = parsed.scheme
    netloc = parsed.netloc
    if port is None:
        port = 443 if scheme == 'https' else 80
    return scheme, netloc, host, port, path


def _parse(url):
    url = url.strip()
    parsed = urlparse(url)
    return _parsed_url_args(parsed)
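
# A quick sanity check of _parse (an illustrative note, not part of the
# original module):
#
#     >>> _parse('http://example.com/page?q=1')
#     ('http', 'example.com', 'example.com', 80, '/page?q=1')
#
# Proxy URLs set via request.meta['proxy'] (see _set_connection_attributes
# below) go through the same helper, so a missing port is defaulted to
# 80 or 443 there as well.

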
class ScrapyHTTPPageGetter(HTTPClient):
    # Twisted HTTPClient that writes the request supplied by its factory
    # and reports status, headers and body back to the factory.

    delimiter = '\n'

    def connectionMade(self):
        self.headers = Headers()  # bucket for response headers

        # Method command
        self.sendCommand(self.factory.method, self.factory.path)
        # Headers
        for key, values in self.factory.headers.items():
            for value in values:
                self.sendHeader(key, value)
        self.endHeaders()
        # Body
        if self.factory.body is not None:
            self.transport.write(self.factory.body)

    def lineReceived(self, line):
        # Strip the trailing CR so HTTPClient sees clean header lines.
        return HTTPClient.lineReceived(self, line.rstrip())

    def handleHeader(self, key, value):
        self.headers.appendlist(key, value)

    def handleStatus(self, version, status, message):
        self.factory.gotStatus(version, status, message)

    def handleEndHeaders(self):
        self.factory.gotHeaders(self.headers)

    def connectionLost(self, reason):
        HTTPClient.connectionLost(self, reason)
        self.factory.noPage(reason)

    def handleResponse(self, response):
        # HEAD responses carry no body, regardless of what the server sent.
        if self.factory.method.upper() == 'HEAD':
            self.factory.page('')
        else:
            self.factory.page(response)
        self.transport.loseConnection()

    def timeout(self):
        self.transport.loseConnection()
        self.factory.noPage(
            defer.TimeoutError("Getting %s took longer than %s seconds." %
                               (self.factory.url, self.factory.timeout)))


class ScrapyHTTPClientFactory(HTTPClientFactory):
    """Scrapy implementation of the HTTPClientFactory, overriding the
    setUrl method to make use of our Url object, which caches the parse
    result.
    """

    protocol = ScrapyHTTPPageGetter
    waiting = 1
    noisy = False
    followRedirect = False
    afterFoundGet = False

    def __init__(self, request, timeout=180):
        self.url = urldefrag(request.url)[0]
        self.method = request.method
        self.body = request.body or None
        self.headers = Headers(request.headers)
        self.response_headers = None
        self.timeout = request.meta.get('download_timeout') or timeout
        self.start_time = time()
        self.deferred = defer.Deferred().addCallback(self._build_response, request)

        # Fixes Twisted 11.1.0+ support, as HTTPClientFactory is expected
        # to have _disconnectedDeferred. See Twisted r32329.
        # Since Scrapy implements its own redirect handling, there is no
        # need to add the _waitForDisconnect callback.
        # Specifically, this avoids the AttributeError exception raised
        # when the clientConnectionFailed method is called.
        self._disconnectedDeferred = defer.Deferred()

        self._set_connection_attributes(request)

        # set Host header based on url
        self.headers.setdefault('Host', self.netloc)

        # set Content-Length based on the length of the body
        if self.body is not None:
            self.headers['Content-Length'] = len(self.body)
            # just in case a broken http/1.1 server decides to keep the
            # connection alive
            self.headers.setdefault("Connection", "close")

    def _build_response(self, body, request):
        request.meta['download_latency'] = self.headers_time - self.start_time
        status = int(self.status)
        headers = Headers(self.response_headers)
        respcls = responsetypes.from_args(headers=headers, url=self.url)
        return respcls(url=self.url, status=status, headers=headers, body=body)

    def _set_connection_attributes(self, request):
        parsed = urlparse_cached(request)
        self.scheme, self.netloc, self.host, self.port, self.path = _parsed_url_args(parsed)
        proxy = request.meta.get('proxy')
        if proxy:
            # When downloading through an HTTP proxy, connect to the proxy
            # host/port and send the absolute URL as the request path.
            self.scheme, _, self.host, self.port, _ = _parse(proxy)
            self.path = self.url

    def gotHeaders(self, headers):
        self.headers_time = time()
        self.response_headers = headers
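
# Illustrative note (an addition, not in the original module): with a proxy
# set via request.meta, e.g.
#
#     request = Request('http://example.com/page',
#                       meta={'proxy': 'http://proxy.example:3128'})
#     factory = ScrapyHTTPClientFactory(request)
#
# _set_connection_attributes points the factory at ('proxy.example', 3128)
# while factory.path becomes the absolute URL 'http://example.com/page'.

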
if 'ssl' in optional_features:
    from twisted.internet.ssl import ClientContextFactory
    from OpenSSL import SSL
else:
    ClientContextFactory = object


class ScrapyClientContextFactory(ClientContextFactory):
    "An SSL context factory which is more permissive against SSL bugs."
    # see https://github.com/scrapy/scrapy/issues/82
    # and https://github.com/scrapy/scrapy/issues/26

    def getContext(self):
        ctx = ClientContextFactory.getContext(self)
        # Enable all workarounds to SSL bugs as documented by
        # http://www.openssl.org/docs/ssl/SSL_CTX_set_options.html
        ctx.set_options(SSL.OP_ALL)
        return ctx
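

if __name__ == '__main__':
    # Minimal usage sketch (an addition for illustration, not part of the
    # original module; in Scrapy these classes are normally driven by the
    # HTTP download handler). Fetches the URL given on the command line
    # and prints the response status and body size.
    import sys
    from twisted.internet import reactor
    from scrapy.http import Request

    factory = ScrapyHTTPClientFactory(Request(url=sys.argv[1]))
    if factory.scheme == 'https':
        reactor.connectSSL(factory.host, factory.port, factory,
                           ScrapyClientContextFactory())
    else:
        reactor.connectTCP(factory.host, factory.port, factory)

    def on_response(response):
        print response.status, len(response.body)
        reactor.stop()

    def on_error(failure_):
        failure_.printTraceback()
        reactor.stop()

    factory.deferred.addCallbacks(on_response, on_error)
    reactor.run()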