Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

import os 

from scrapy.utils.request import request_fingerprint 

from scrapy.utils.job import job_dir 

 

 

class BaseDupeFilter(object):
    """No-op duplicates filter: it never reports a request as already seen."""

    @classmethod
    def from_settings(cls, settings):
        """Build a filter instance; ``settings`` is accepted but not read here."""
        return cls()

    def request_seen(self, request):
        """Return False for every ``request``."""
        return False

    def open(self):
        """Do nothing and return None (overrides can return a deferred)."""

    def close(self, reason):
        """Do nothing and return None (overrides can return a deferred).

        ``reason`` is accepted but not used by this base implementation.
        """

 

 

class RFPDupeFilter(BaseDupeFilter):
    """Request Fingerprint duplicates filter.

    Fingerprints of seen requests are kept in an in-memory set. When a
    ``path`` is given, they are also appended to ``<path>/requests.seen``,
    and the fingerprints already stored in that file are loaded when the
    filter is constructed.
    """

    def __init__(self, path=None):
        """Create the filter.

        :param path: directory holding the ``requests.seen`` file, or a
            falsy value to keep fingerprints in memory only.
        """
        self.file = None
        self.fingerprints = set()
        if path:
            self.file = open(os.path.join(path, 'requests.seen'), 'a+')
            # In 'a+' mode the stream may start positioned at end of file,
            # so rewind before reading or no stored fingerprint is loaded.
            # Writes still go to the end because the file is in append mode.
            self.file.seek(0)
            self.fingerprints.update(x.rstrip() for x in self.file)

    @classmethod
    def from_settings(cls, settings):
        """Build a filter whose path is ``job_dir(settings)``."""
        return cls(job_dir(settings))

    def request_seen(self, request):
        """Record ``request`` and tell whether it was already recorded.

        :returns: True if the request's fingerprint was already known;
            False otherwise, after storing it (and appending it to the
            backing file when one is open).
        """
        fp = request_fingerprint(request)
        if fp in self.fingerprints:
            return True
        self.fingerprints.add(fp)
        if self.file:
            # The file is opened in text mode, which already translates
            # '\n' to the platform line ending; os.linesep would be
            # translated a second time on Windows.
            self.file.write(fp + '\n')
        return False

    def close(self, reason):
        """Close the backing file, if any. ``reason`` is not used."""
        if self.file:
            self.file.close()