Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

"""Helper functions which doesn't fit anywhere else""" 

 

import re 

import hashlib 

from pkgutil import iter_modules 

 

from w3lib.html import remove_entities 

from scrapy.utils.python import flatten 

 

def arg_to_iter(arg):
    """Return something that can be looped over for any kind of argument.

    * None gives an empty list
    * a dict, or any object lacking an ``__iter__`` attribute, is wrapped in
      a one-element list
    * every other object is handed back untouched
    """
    if arg is None:
        return []
    # Dicts count as single values even though they define __iter__.
    if isinstance(arg, dict) or not hasattr(arg, '__iter__'):
        return [arg]
    return arg

 

def load_object(path):
    """Load an object given its absolute object path, and return it.

    The object can be a class, function, variable or instance.
    path ie: 'scrapy.contrib.downloadermiddleware.redirect.RedirectMiddleware'

    Raises ValueError if path contains no dot, ImportError if the module part
    of the path cannot be imported, and NameError if the imported module has
    no attribute with the requested name.
    """

    try:
        dot = path.rindex('.')
    except ValueError:
        raise ValueError("Error loading object '%s': not a full path" % path)

    # Everything before the last dot is the module, the rest is the attribute.
    module, name = path[:dot], path[dot+1:]
    try:
        # A non-empty fromlist makes __import__ return the module named by
        # the full dotted path instead of the top-level package.
        mod = __import__(module, {}, {}, [''])
    except ImportError as e:
        raise ImportError("Error loading object '%s': %s" % (path, e))

    try:
        obj = getattr(mod, name)
    except AttributeError:
        raise NameError("Module '%s' doesn't define any object named '%s'" % (module, name))

    return obj

 

def walk_modules(path, load=False):
    """Import the module at the given dotted path plus, when it is a package,
    everything underneath it, and return all of them as a list (the module
    for ``path`` itself comes first).

    Import errors are not caught: the first exception raised while importing
    any module propagates to the caller.

    The ``load`` argument is accepted but not used by this function.

    For example: walk_modules('scrapy.utils')
    """
    root = __import__(path, {}, {}, [''])
    found = [root]

    # Only packages carry a __path__; a plain module has nothing to descend into.
    if not hasattr(root, '__path__'):
        return found

    for _, child, is_package in iter_modules(root.__path__):
        dotted = path + '.' + child
        if is_package:
            found.extend(walk_modules(dotted))
        else:
            found.append(__import__(dotted, {}, {}, ['']))
    return found

 

def extract_regex(regex, text, encoding='utf-8'):
    """Extract a list of unicode strings from the given text/encoding using the following policies:

    * if the regex contains a named group called "extract" that will be returned
    * if the regex contains multiple numbered groups, all those will be returned (flattened)
    * if the regex doesn't contain any group the entire regex matching is returned

    regex may be a pattern string (compiled here with re.UNICODE) or an
    already compiled pattern. When text is not a unicode object, every
    extracted string is decoded using encoding. Each result is passed through
    remove_entities with keep=['lt', 'amp'].
    """

    if isinstance(regex, basestring):
        regex = re.compile(regex, re.UNICODE)

    try:
        strings = [regex.search(text).group('extract')]   # named group
    except (AttributeError, IndexError):
        # AttributeError: search() found no match and returned None.
        # IndexError: the pattern has no group named "extract".
        strings = regex.findall(text)    # full regex or numbered groups
    strings = flatten(strings)

    if isinstance(text, unicode):
        return [remove_entities(s, keep=['lt', 'amp']) for s in strings]
    else:
        return [remove_entities(unicode(s, encoding), keep=['lt', 'amp']) for s in strings]

 

def md5sum(file):
    """Return the hexadecimal md5 digest of a file-like object's content.

    The object is consumed through successive read(8096) calls, so its whole
    content is never held in memory at once.

    >>> from StringIO import StringIO
    >>> md5sum(StringIO('file content to hash'))
    '784406af91dd5a54fbb9c84c2236595a'
    """
    digest = hashlib.md5()
    chunk = file.read(8096)
    # An empty read signals that the stream is exhausted.
    while chunk:
        digest.update(chunk)
        chunk = file.read(8096)
    return digest.hexdigest()