Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

""" 

Images Pipeline 

 

See documentation in topics/images.rst 

""" 

 

import os 

import time 

import hashlib 

import urlparse 

import rfc822 

import Image 

from cStringIO import StringIO 

from collections import defaultdict 

 

from twisted.internet import defer, threads 

 

from scrapy.xlib.pydispatch import dispatcher 

from scrapy import log 

from scrapy.stats import stats 

from scrapy.utils.misc import md5sum 

from scrapy.http import Request 

from scrapy import signals 

from scrapy.exceptions import DropItem, NotConfigured, IgnoreRequest 

from scrapy.contrib.pipeline.media import MediaPipeline 

 

 

class NoimagesDrop(DropItem):

    """Raised to drop an item (product) because it contains no images."""

 

class ImageException(Exception):

    """General image error: failed download, empty body, too small, or
    a processing failure while converting/saving the image."""

 

 

class FSImagesStore(object):
    """Store images on the local filesystem below a base directory.

    Slash-separated keys (e.g. ``full/<hash>.jpg``) are mapped to paths
    under ``basedir``; intermediate directories are created on demand and
    cached to avoid repeated filesystem checks.
    """

    def __init__(self, basedir):
        # Accept URIs like 'file:///some/dir' by stripping the scheme part.
        if '://' in basedir:
            basedir = basedir.split('://', 1)[1]
        self.basedir = basedir
        self._mkdir(self.basedir)
        # Cache of directories already created, keyed per domain/spider.
        # NOTE(review): _mkdir is keyed by the ``info`` object passed from
        # persist_image, but spider_closed pops by ``spider.name`` — these
        # keys never match, so entries are not evicted; confirm intended.
        self.created_directories = defaultdict(set)
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, spider):
        """Drop the directory cache entry for a finished spider."""
        self.created_directories.pop(spider.name, None)

    def persist_image(self, key, image, buf, info):
        """Save a PIL image under ``key``, creating parent dirs as needed."""
        absolute_path = self._get_filesystem_path(key)
        self._mkdir(os.path.dirname(absolute_path), info)
        image.save(absolute_path)

    def stat_image(self, key, info):
        """Return ``{'last_modified': mtime, 'checksum': md5}`` for ``key``,
        or an empty dict if the image is missing or cannot be stat'ed."""
        absolute_path = self._get_filesystem_path(key)
        try:
            last_modified = os.path.getmtime(absolute_path)
        except OSError:
            # File absent or not statable: report "no stored image" so the
            # pipeline re-downloads it.
            return {}

        with open(absolute_path, 'rb') as imagefile:
            checksum = md5sum(imagefile)

        return {'last_modified': last_modified, 'checksum': checksum}

    def _get_filesystem_path(self, key):
        """Translate a slash-separated key into a path under basedir."""
        path_comps = key.split('/')
        return os.path.join(self.basedir, *path_comps)

    def _mkdir(self, dirname, domain=None):
        """Create ``dirname`` once, remembering it per ``domain`` to skip
        redundant exists/makedirs calls."""
        seen = self.created_directories[domain] if domain else set()
        if dirname not in seen:
            if not os.path.exists(dirname):
                os.makedirs(dirname)
            seen.add(dirname)

 

 

class S3ImagesStore(object):
    """Store images in an Amazon S3 bucket through boto.

    The destination is an ``s3://bucket/prefix/`` URI. Blocking boto calls
    are run in a thread via twisted's ``deferToThread`` so they do not
    block the reactor.
    """

    # Filled in by ImagesPipeline.from_settings from the AWS_* settings.
    AWS_ACCESS_KEY_ID = None
    AWS_SECRET_ACCESS_KEY = None

    POLICY = 'public-read'
    HEADERS = {
            'Cache-Control': 'max-age=172800',
            'Content-Type': 'image/jpeg',
            }

    def __init__(self, uri):
        assert uri.startswith('s3://')
        self.bucket, self.prefix = uri[5:].split('/', 1)

    def stat_image(self, key, info):
        """Return a deferred firing with checksum/last_modified for ``key``."""
        def _build_stat(boto_key):
            modified_tuple = rfc822.parsedate_tz(boto_key.last_modified)
            return {
                'checksum': boto_key.etag.strip('"'),
                'last_modified': int(rfc822.mktime_tz(modified_tuple)),
            }

        return self._get_boto_key(key).addCallback(_build_stat)

    def _get_boto_bucket(self):
        from boto.s3.connection import S3Connection
        # disable ssl (is_secure=False) because of this python bug:
        # http://bugs.python.org/issue5103
        conn = S3Connection(self.AWS_ACCESS_KEY_ID, self.AWS_SECRET_ACCESS_KEY,
                            is_secure=False)
        return conn.get_bucket(self.bucket, validate=False)

    def _get_boto_key(self, key):
        """Fetch the boto Key object for ``key`` in a worker thread."""
        bucket = self._get_boto_bucket()
        return threads.deferToThread(bucket.get_key, '%s%s' % (self.prefix, key))

    def persist_image(self, key, image, buf, info):
        """Upload image to S3 storage"""
        width, height = image.size
        bucket = self._get_boto_bucket()
        key_obj = bucket.new_key('%s%s' % (self.prefix, key))
        key_obj.set_metadata('width', str(width))
        key_obj.set_metadata('height', str(height))
        buf.seek(0)
        return threads.deferToThread(key_obj.set_contents_from_file, buf,
                                     headers=self.HEADERS, policy=self.POLICY)

 

 

class ImagesPipeline(MediaPipeline):
    """Abstract pipeline that implement the image downloading and thumbnail generation logic

    This pipeline tries to minimize network transfers and image processing,
    doing stat of the images and determining if image is new, uptodate or
    expired.

    `new` images are those that pipeline never processed and needs to be
        downloaded from supplier site the first time.

    `uptodate` images are the ones that the pipeline processed and are still
        valid images.

    `expired` images are those that pipeline already processed but the last
        modification was made long time ago, so a reprocessing is recommended to
        refresh it in case of change.

    """

    MEDIA_NAME = 'image'
    MIN_WIDTH = 0   # reject images narrower than this (pixels); 0 disables the check
    MIN_HEIGHT = 0  # reject images shorter than this (pixels); 0 disables the check
    EXPIRES = 90    # days before a stored image is considered expired and re-downloaded
    THUMBS = {}     # thumb_id -> size; a thumbnail is generated for each entry
    # Storage backend class, selected by the URI scheme of IMAGES_STORE.
    STORE_SCHEMES = {
            '': FSImagesStore,
            'file': FSImagesStore,
            's3': S3ImagesStore,
            }

    def __init__(self, store_uri, download_func=None):
        # store_uri comes from the IMAGES_STORE setting; the pipeline is
        # disabled entirely when it is unset.
        if not store_uri:
            raise NotConfigured
        self.store = self._get_store(store_uri)
        super(ImagesPipeline, self).__init__(download_func=download_func)

    @classmethod
    def from_settings(cls, settings):
        """Build the pipeline from crawler settings.

        NOTE(review): assigns onto the *class*, so these settings become
        process-global for every instance of this pipeline class.
        """
        cls.MIN_WIDTH = settings.getint('IMAGES_MIN_WIDTH', 0)
        cls.MIN_HEIGHT = settings.getint('IMAGES_MIN_HEIGHT', 0)
        cls.EXPIRES = settings.getint('IMAGES_EXPIRES', 90)
        cls.THUMBS = settings.get('IMAGES_THUMBS', {})
        s3store = cls.STORE_SCHEMES['s3']
        s3store.AWS_ACCESS_KEY_ID = settings['AWS_ACCESS_KEY_ID']
        s3store.AWS_SECRET_ACCESS_KEY = settings['AWS_SECRET_ACCESS_KEY']
        store_uri = settings['IMAGES_STORE']
        return cls(store_uri)

    def _get_store(self, uri):
        """Instantiate the storage backend matching ``uri``'s scheme."""
        if os.path.isabs(uri): # to support win32 paths like: C:\\some\dir
            scheme = 'file'
        else:
            scheme = urlparse.urlparse(uri).scheme
        store_cls = self.STORE_SCHEMES[scheme]
        return store_cls(uri)

    def media_downloaded(self, response, request, info):
        """MediaPipeline hook: validate a finished download, persist the
        image, and return the {'url', 'path', 'checksum'} result dict.

        Raises ImageException on any failure so MediaPipeline records the
        request as failed.
        """
        referer = request.headers.get('Referer')

        # Non-200 responses are treated as download failures.
        if response.status != 200:
            log.msg('Image (code: %s): Error downloading image from %s referred in <%s>' \
                    % (response.status, request, referer), level=log.WARNING, spider=info.spider)
            raise ImageException

        # An empty body cannot be a valid image.
        if not response.body:
            log.msg('Image (empty-content): Empty image from %s referred in <%s>: no-content' \
                    % (request, referer), level=log.WARNING, spider=info.spider)
            raise ImageException

        status = 'cached' if 'cached' in response.flags else 'downloaded'
        msg = 'Image (%s): Downloaded image from %s referred in <%s>' % \
                (status, request, referer)
        log.msg(msg, level=log.DEBUG, spider=info.spider)
        self.inc_stats(info.spider, status)

        try:
            key = self.image_key(request.url)
            checksum = self.image_downloaded(response, request, info)
        except ImageException, ex:
            # Expected image errors (too small, conversion failure): log and
            # propagate as-is.
            log.msg(str(ex), level=log.WARNING, spider=info.spider)
            raise
        except Exception:
            # Unexpected errors: log the traceback, then normalize to
            # ImageException for MediaPipeline.
            log.err(spider=info.spider)
            raise ImageException

        return {'url': request.url, 'path': key, 'checksum': checksum}

    def media_failed(self, failure, request, info):
        """MediaPipeline hook: log a download failure (unless it was an
        IgnoreRequest) and convert it to ImageException."""
        if not isinstance(failure.value, IgnoreRequest):
            referer = request.headers.get('Referer')
            msg = 'Image (unknown-error): Error downloading %s from %s referred in <%s>: %s' \
                    % (self.MEDIA_NAME, request, referer, str(failure))
            log.msg(msg, level=log.WARNING, spider=info.spider)
        raise ImageException

    def media_to_download(self, request, info):
        """MediaPipeline hook: stat the stored image and, if it is still
        fresh (younger than EXPIRES days), return its result dict to skip
        the download. Returning None (any errback/early return) forces a
        download."""
        def _onsuccess(result):
            if not result:
                return # returning None force download

            last_modified = result.get('last_modified', None)
            if not last_modified:
                return # returning None force download

            age_seconds = time.time() - last_modified
            age_days = age_seconds / 60 / 60 / 24
            if age_days > self.EXPIRES:
                return # returning None force download

            referer = request.headers.get('Referer')
            log.msg('Image (uptodate): Downloaded %s from <%s> referred in <%s>' % \
                    (self.MEDIA_NAME, request.url, referer), level=log.DEBUG, spider=info.spider)
            self.inc_stats(info.spider, 'uptodate')

            checksum = result.get('checksum', None)
            return {'url': request.url, 'path': key, 'checksum': checksum}

        key = self.image_key(request.url)
        dfd = defer.maybeDeferred(self.store.stat_image, key, info)
        # Any stat error means "not stored": fall through to download.
        dfd.addCallbacks(_onsuccess, lambda _:None)
        dfd.addErrback(log.err, self.__class__.__name__ + '.store.stat_image')
        return dfd

    def image_downloaded(self, response, request, info):
        """Persist the full image and all thumbnails; return the md5
        checksum of the converted full-size image (the first one yielded
        by get_images)."""
        checksum = None
        for key, image, buf in self.get_images(response, request, info):
            if checksum is None:
                buf.seek(0)
                checksum = md5sum(buf)
            self.store.persist_image(key, image, buf, info)
        return checksum

    def get_images(self, response, request, info):
        """Generator yielding (key, PIL image, JPEG buffer) for the
        converted full-size image followed by each configured thumbnail.

        Raises ImageException if the image is smaller than
        MIN_WIDTH x MIN_HEIGHT.
        """
        key = self.image_key(request.url)
        orig_image = Image.open(StringIO(response.body))

        width, height = orig_image.size
        if width < self.MIN_WIDTH or height < self.MIN_HEIGHT:
            raise ImageException("Image too small (%dx%d < %dx%d): %s" % \
                    (width, height, self.MIN_WIDTH, self.MIN_HEIGHT, response.url))

        image, buf = self.convert_image(orig_image)
        yield key, image, buf

        # Thumbnails are derived from the already-converted full image.
        for thumb_id, size in self.THUMBS.iteritems():
            thumb_key = self.thumb_key(request.url, thumb_id)
            thumb_image, thumb_buf = self.convert_image(image, size)
            yield thumb_key, thumb_image, thumb_buf

    def inc_stats(self, spider, status):
        """Bump global and per-status image counters for this spider."""
        stats.inc_value('image_count', spider=spider)
        stats.inc_value('image_status_count/%s' % status, spider=spider)

    def convert_image(self, image, size=None):
        """Convert ``image`` to an RGB JPEG, optionally downscaling to fit
        within ``size``; return (image, StringIO buffer).

        Raises ImageException if PIL cannot save the image as JPEG.
        """
        if image.format == 'PNG' and image.mode == 'RGBA':
            # Flatten transparent PNGs onto a white background; paste's
            # two-argument form uses the image itself as the alpha mask.
            background = Image.new('RGBA', image.size, (255, 255, 255))
            background.paste(image, image)
            image = background.convert('RGB')
        elif image.mode != 'RGB':
            image = image.convert('RGB')

        if size:
            # copy() first: thumbnail() resizes in place.
            image = image.copy()
            image.thumbnail(size, Image.ANTIALIAS)

        buf = StringIO()
        try:
            image.save(buf, 'JPEG')
        except Exception, ex:
            raise ImageException("Cannot process image. Error: %s" % ex)

        return image, buf

    def image_key(self, url):
        """Storage key for the full-size image: SHA1 of the URL."""
        image_guid = hashlib.sha1(url).hexdigest()
        return 'full/%s.jpg' % (image_guid)

    def thumb_key(self, url, thumb_id):
        """Storage key for a thumbnail: SHA1 of the URL under thumb_id."""
        image_guid = hashlib.sha1(url).hexdigest()
        return 'thumbs/%s/%s.jpg' % (thumb_id, image_guid)

    def get_media_requests(self, item, info):
        """MediaPipeline hook: one Request per URL in item['image_urls']."""
        return [Request(x) for x in item.get('image_urls', [])]

    def item_completed(self, results, item, info):
        """MediaPipeline hook: keep only successful download results in
        item['images'] and pass the item along."""
        item['images'] = [x for ok, x in results if ok]
        return item