Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

from twisted.internet import task 

 

from scrapy.xlib.pydispatch import dispatcher 

from scrapy.exceptions import NotConfigured 

from scrapy.conf import settings 

from scrapy import log, signals 

 

class Slot(object):
    """Per-spider counters: running totals plus the totals at the last snapshot."""

    def __init__(self):
        # Each pair starts at zero: the running total and the value it had
        # the last time the stats were logged.
        self.items = self.itemsprev = 0
        self.pages = self.pagesprev = 0

 

class LogStats(object):
    """Log basic scraping stats periodically.

    One ``Slot`` of counters is kept per open spider. Every
    ``LOGSTATS_INTERVAL`` seconds the totals and the per-minute rates
    (derived from the change since the previous report) are logged for
    each open spider.
    """

    # Periodic task created in engine_started(). Defaults to None so that
    # engine_stopped() is safe even if the engine never started.
    tsk = None

    def __init__(self):
        """Read the reporting interval and hook up the signal handlers.

        Raises:
            NotConfigured: if ``LOGSTATS_INTERVAL`` is unset or zero.
        """
        self.interval = settings.getfloat('LOGSTATS_INTERVAL')
        if not self.interval:
            raise NotConfigured
        # Maps each open spider to its Slot of counters.
        self.slots = {}
        # Scales a per-interval delta to a per-minute rate.
        self.multiplier = 60.0 / self.interval
        dispatcher.connect(self.item_scraped, signal=signals.item_scraped)
        dispatcher.connect(self.response_received, signal=signals.response_received)
        dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
        dispatcher.connect(self.engine_started, signal=signals.engine_started)
        dispatcher.connect(self.engine_stopped, signal=signals.engine_stopped)

    def item_scraped(self, spider):
        """Count one more scraped item for ``spider``."""
        self.slots[spider].items += 1

    def response_received(self, spider):
        """Count one more received page for ``spider``."""
        self.slots[spider].pages += 1

    def spider_opened(self, spider):
        """Start tracking ``spider`` with a fresh set of counters."""
        self.slots[spider] = Slot()

    def spider_closed(self, spider):
        """Stop tracking ``spider`` and drop its counters."""
        del self.slots[spider]

    def engine_started(self):
        """Start the looping call that invokes ``log`` every ``interval`` seconds."""
        self.tsk = task.LoopingCall(self.log)
        self.tsk.start(self.interval)

    def log(self):
        """Log totals and per-minute rates for every tracked spider."""
        template = "Crawled %d pages (at %d pages/min), scraped %d items (at %d items/min)"
        for spider, slot in self.slots.items():
            irate = (slot.items - slot.itemsprev) * self.multiplier
            prate = (slot.pages - slot.pagesprev) * self.multiplier
            # Remember the current totals so the next report measures only
            # what happened since this one.
            slot.pagesprev, slot.itemsprev = slot.pages, slot.items
            msg = template % (slot.pages, prate, slot.items, irate)
            log.msg(msg, spider=spider)

    def engine_stopped(self):
        """Stop the periodic task if it was started and is still running."""
        tsk = self.tsk
        if tsk is not None and tsk.running:
            tsk.stop()