Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

""" 

This module provides some useful functions for working with 

scrapy.http.Request objects 

""" 

 

import hashlib 

import weakref 

from urlparse import urlunparse 

 

from twisted.internet.defer import Deferred 

from w3lib.http import basic_auth_header 

 

from scrapy.utils.url import canonicalize_url 

from scrapy.utils.httpobj import urlparse_cached 

 

 

# Per-request fingerprint cache. Keys are held weakly, so an entry goes away
# together with its request object. Each value is a dict mapping the
# normalized ``include_headers`` tuple (or None) to the hex digest.
_fingerprint_cache = weakref.WeakKeyDictionary()


def request_fingerprint(request, include_headers=None):
    """
    Return the request fingerprint.

    The request fingerprint is a hash that uniquely identifies the resource
    the request points to. For example, take the following two urls:

    http://www.example.com/query?id=111&cat=222
    http://www.example.com/query?cat=222&id=111

    Even though those are two different URLs, both point to the same resource
    and are equivalent (ie. they should return the same response).

    Another example are cookies used to store session ids. Suppose the
    following page is only accessible to authenticated users:

    http://www.example.com/members/offers.html

    Lots of sites use a cookie to store the session id, which adds a random
    component to the HTTP Request and thus should be ignored when calculating
    the fingerprint.

    For this reason, request headers are ignored by default when calculating
    the fingerprint. If you want to include specific headers use the
    include_headers argument, which is an iterable of Request header names to
    include. Header names are compared case-insensitively and their order is
    irrelevant. Any false value (None, an empty list, ...) means "no headers".

    The fingerprint is the SHA1 hex digest of the request method, the
    canonicalized URL, the body and, for every requested header present in
    the request, the header name followed by all of its values. Results are
    cached per request object and per set of included headers.
    """
    if include_headers:
        # Lower-case first, then sort: sorting the raw names would make the
        # order (and therefore the hash) depend on the caller's casing.
        include_headers = tuple(sorted(h.lower() for h in include_headers))
    else:
        # Collapse every false value to a single hashable cache key; an
        # empty list would otherwise fail as a dict key.
        include_headers = None

    cache = _fingerprint_cache.setdefault(request, {})
    try:
        return cache[include_headers]
    except KeyError:
        pass

    fp = hashlib.sha1()
    fp.update(request.method)
    fp.update(canonicalize_url(request.url))
    fp.update(request.body or '')
    for hdr in include_headers or ():
        if hdr in request.headers:
            fp.update(hdr)
            for v in request.headers.getlist(hdr):
                fp.update(v)

    digest = cache[include_headers] = fp.hexdigest()
    return digest

 

def request_authenticate(request, username, password):
    """Authenticate the given request using HTTP basic access authentication
    (RFC 2617).

    The request is modified in place: its ``Authorization`` header is set to
    the value that ``basic_auth_header`` builds from the given username and
    password. Nothing is returned.
    """
    auth_value = basic_auth_header(username, password)
    request.headers['Authorization'] = auth_value

 

def request_httprepr(request):
    """Return the raw HTTP representation (as string) of the given request.

    This is provided only for reference since it's not the actual stream of
    bytes that will be sent when performing the request (that's controlled
    by Twisted).

    The result consists of the request line (method, path with params and
    query, ``HTTP/1.1``), a ``Host`` header, the request headers (if any),
    an empty line and the request body. A URL without a hostname yields an
    empty ``Host`` value, and a missing body is rendered as an empty string.
    """
    parsed = urlparse_cached(request)
    # Only path, params and query go in the request line; an empty path
    # is rendered as "/".
    path = urlunparse(('', '', parsed.path or '/', parsed.params, parsed.query, ''))
    parts = [
        "%s %s HTTP/1.1\r\n" % (request.method, path),
        # hostname is None for URLs without a network location; avoid
        # emitting the literal text "None".
        "Host: %s\r\n" % (parsed.hostname or ''),
    ]
    if request.headers:
        parts.append(request.headers.to_string() + "\r\n")
    parts.append("\r\n")
    # Same guard as request_fingerprint: tolerate a missing body.
    parts.append(request.body or '')
    return "".join(parts)

 

def request_deferred(request):
    """Wrap a request inside a Deferred.

    A new Deferred is created and, when the request has a callback, the
    request's current callback and errback are added to it as its first
    callback pair. The request's callback and errback are then replaced by
    the Deferred's own ``callback`` and ``errback`` methods, so the Deferred
    triggers when the request callback/errback is executed (ie. when the
    request is downloaded).

    Returns the new Deferred; the request is modified in place.
    """
    deferred = Deferred()
    original_callback = request.callback
    if original_callback:
        # Chain the user's handlers first so they still run once the
        # deferred fires.
        deferred.addCallbacks(original_callback, request.errback)
    # From now on, firing the request's callback/errback fires the deferred.
    request.callback = deferred.callback
    request.errback = deferred.errback
    return deferred