Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

""" 

This module provides some useful functions for working with 

scrapy.http.Response objects 

""" 

 

import os 

import re 

import weakref 

import webbrowser 

import tempfile 

 

from twisted.web import http 

from twisted.web.http import RESPONSES 

from w3lib import html 

 

from scrapy.http import Response, HtmlResponse 

 

def body_or_str(obj, unicode=True):
    """Return the content of a Response or string as unicode or bytes.

    ``obj`` must be a Response or a basestring (checked with an assert).
    When ``unicode`` is true the result is the unicode form of the content,
    otherwise the byte-string form. Plain strings are converted between the
    two forms using UTF-8.
    """
    assert isinstance(obj, (Response, basestring)), \
        "obj must be Response or basestring, not %s" % type(obj).__name__

    if isinstance(obj, Response):
        if unicode:
            return obj.body_as_unicode()
        return obj.body

    if isinstance(obj, str):
        # Byte string input: decode only when unicode output is requested.
        if unicode:
            return obj.decode('utf-8')
        return obj

    # Any other basestring (under Python 2, a unicode object): encode only
    # when byte output is requested.
    if unicode:
        return obj
    return obj.encode('utf-8')

 

# Per-response memo of computed base urls. Keys are held weakly, so an entry
# does not keep its response object alive.
_baseurl_cache = weakref.WeakKeyDictionary()
def get_base_url(response):
    """Return the base url of the given response, joined with the response url.

    Only the first 4096 characters of the decoded body are handed to
    ``w3lib.html.get_base_url``, together with the response url and
    encoding. The result is cached per response object, so repeated calls
    with the same response do not re-parse the body.
    """
    if response not in _baseurl_cache:
        text = response.body_as_unicode()[0:4096]
        _baseurl_cache[response] = html.get_base_url(
            text, response.url, response.encoding)
    return _baseurl_cache[response]

 

# Patterns used to drop <noscript> and <script> elements before looking for
# a meta refresh tag (case-insensitive, matching across newlines).
_noscript_re = re.compile(u'<noscript>.*?</noscript>', re.IGNORECASE | re.DOTALL)
_script_re = re.compile(u'<script.*?>.*?</script>', re.IGNORECASE | re.DOTALL)
# Per-response memo of parsed meta refresh values; keys are held weakly.
_metaref_cache = weakref.WeakKeyDictionary()
def get_meta_refresh(response):
    """Parse the http-equiv refresh parameter from the given response.

    Only the first 4096 characters of the decoded body are inspected.
    ``<noscript>`` and ``<script>`` elements found in that window are removed
    first, then the remaining text is passed to
    ``w3lib.html.get_meta_refresh`` along with the response url and
    encoding. Whatever that helper returns is cached per response object and
    returned.
    """
    if response not in _metaref_cache:
        text = response.body_as_unicode()[0:4096]
        text = _noscript_re.sub(u'', text)
        text = _script_re.sub(u'', text)
        _metaref_cache[response] = html.get_meta_refresh(
            text, response.url, response.encoding)
    return _metaref_cache[response]

 

def response_status_message(status):
    """Return status code plus status text descriptive message

    ``status`` may be an int or anything ``int()`` accepts (e.g. ``'200'``);
    it is echoed back unchanged in the message. Codes that are missing from
    the ``RESPONSES`` table are described as ``Unknown Status`` instead of
    leaking the text ``None`` into the message.

    >>> response_status_message(200)
    '200 OK'

    >>> response_status_message(404)
    '404 Not Found'

    >>> response_status_message(573)
    '573 Unknown Status'
    """
    reason = RESPONSES.get(int(status), 'Unknown Status')
    return '%s %s' % (status, reason)

 

def response_httprepr(response):
    """Return raw HTTP representation (as string) of the given response.

    The result is a status line, the headers (when the response has any),
    a blank line and the body. It is meant for reference only: it is
    rebuilt from the response attributes and is not the exact stream of
    bytes that was received (that's not exposed by Twisted).
    """
    reason = RESPONSES.get(response.status, '')
    status_line = "HTTP/1.1 %d %s\r\n" % (response.status, reason)

    # Headers are emitted only when the headers object is truthy.
    if response.headers:
        header_block = response.headers.to_string() + "\r\n"
    else:
        header_block = ""

    return status_line + header_block + "\r\n" + response.body

 

def open_in_browser(response, _openfunc=webbrowser.open):
    """Open the given response in a local web browser, populating the <base>
    tag for external links to work

    The response body is written to a new temporary ``.html`` file, which is
    then opened through ``_openfunc`` using a ``file://`` url. The temporary
    file is left on disk after the call.

    ``_openfunc`` is the callable used to open the url (defaults to
    ``webbrowser.open``); its return value is returned.

    Raises TypeError if ``response`` is not an HtmlResponse.
    """
    # XXX: this implementation is a bit dirty and could be improved
    if not isinstance(response, HtmlResponse):
        raise TypeError("Unsupported response type: %s" %
                        response.__class__.__name__)
    body = response.body
    if '<base' not in body:
        # No <base> tag present: inject one pointing at the response url so
        # relative links resolve against the original site.
        body = body.replace('<head>', '<head><base href="%s">' % response.url)
    fd, fname = tempfile.mkstemp('.html')
    # Wrap the descriptor in a file object so the whole body is written and
    # the descriptor is closed even if the write raises.
    with os.fdopen(fd, 'wb') as tmpfile:
        tmpfile.write(body)
    return _openfunc("file://%s" % fname)