Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

from scrapy import optional_features 

from scrapy.exceptions import NotConfigured 

from scrapy.utils.httpobj import urlparse_cached 

from scrapy.conf import settings 

from .http import HttpDownloadHandler 

 

# boto is an optional dependency: when it cannot be imported, fall back to
# ``object`` so the S3Connection subclasses defined in this module can still
# be created at import time.
try:

    from boto.s3.connection import S3Connection

except ImportError:

    S3Connection = object

 

class _v19_S3Connection(S3Connection):
    """S3Connection stand-in that never performs a synchronous download.

    ``_mexe`` is overridden so that, instead of executing anything, it simply
    hands back the ``headers`` argument it received.
    """

    def _mexe(self, method, bucket, key, headers, *args, **kwargs):
        """Return ``headers`` untouched; every other argument is ignored."""
        # No network activity here: the caller only wants the headers back.
        return headers

 

class _v20_S3Connection(S3Connection):
    """S3Connection stand-in that never performs a synchronous download.

    ``_mexe`` is overridden so that it authorizes the request object against
    this connection and returns the request's headers instead of sending it.
    """

    def _mexe(self, http_request, *args):
        """Authorize ``http_request`` and return its headers.

        Any extra positional arguments are accepted and ignored.
        """
        # Authorization must happen before the headers are read back.
        http_request.authorize(connection=self)
        authorized_headers = http_request.headers
        return authorized_headers

 

# Choose the connection wrapper: start from the _v19 variant and switch to the
# _v20 variant only when ``boto.auth`` can be imported.
_S3Connection = _v19_S3Connection
try:
    import boto.auth
except ImportError:
    # ``boto.auth`` is unavailable: keep the _v19 wrapper selected above.
    pass
else:
    _S3Connection = _v20_S3Connection

 

 

class S3DownloadHandler(object):
    """Download handler that rewrites S3 requests and delegates them to HTTP.

    Each request is turned into a plain HTTP(S) request against
    ``<bucket>.s3.amazonaws.com``, carrying the headers produced by the
    module's ``_S3Connection`` wrapper, and is then passed to the wrapped HTTP
    download handler.
    """

    def __init__(self, aws_access_key_id=None, aws_secret_access_key=None,
                 httpdownloadhandler=HttpDownloadHandler):
        """Create the signing connection and the underlying HTTP downloader.

        :param aws_access_key_id: access key; when falsy, the
            ``AWS_ACCESS_KEY_ID`` setting is used instead.
        :param aws_secret_access_key: secret key; when falsy, the
            ``AWS_SECRET_ACCESS_KEY`` setting is used instead.
        :param httpdownloadhandler: zero-argument callable returning an object
            with a ``download_request`` method; defaults to
            ``HttpDownloadHandler``.
        :raises NotConfigured: if ``'boto'`` is not in ``optional_features``
            or if building the connection raises any exception.
        """
        if 'boto' not in optional_features:
            raise NotConfigured("missing boto library")

        # Explicit arguments win; otherwise fall back to project settings.
        if not aws_access_key_id:
            aws_access_key_id = settings['AWS_ACCESS_KEY_ID']
        if not aws_secret_access_key:
            aws_secret_access_key = settings['AWS_SECRET_ACCESS_KEY']

        try:
            self.conn = _S3Connection(aws_access_key_id, aws_secret_access_key)
        except Exception as ex:
            # Any failure while building the connection is reported as the
            # handler being unconfigured, keeping the original message text.
            raise NotConfigured(str(ex))
        self._download_http = httpdownloadhandler().download_request

    def download_request(self, request, spider):
        """Rewrite ``request`` to target S3 over HTTP(S) and download it.

        The bucket name is the hostname of the request URL. The scheme is
        ``https`` when ``request.meta['is_secure']`` is truthy and ``http``
        otherwise.

        :param request: the request to download.
        :param spider: passed through unchanged to the HTTP handler.
        :returns: whatever the wrapped handler's ``download_request`` returns.
        """
        p = urlparse_cached(request)
        scheme = 'https' if request.meta.get('is_secure') else 'http'
        bucket = p.hostname
        # Only append the '?query' part when the URL actually has a query.
        path = (p.path + '?' + p.query) if p.query else p.path
        url = '%s://%s.s3.amazonaws.com%s' % (scheme, bucket, path)
        # NOTE(review): p.path and p.query are handed to boto exactly as they
        # appear in the parsed URL (no unquoting here) -- confirm this is the
        # form boto expects for keys containing escaped characters.
        signed_headers = self.conn.make_request(
            method=request.method,
            bucket=bucket,
            key=p.path,
            query_args=p.query,
            headers=request.headers,
            data=request.body)
        httpreq = request.replace(url=url, headers=signed_headers)
        return self._download_http(httpreq, spider)