URL Opener section
A URL opener source section requests a URL and inserts keys for the response and headers, optionally also using a local cache.
>>> urlopener = """
... [transmogrifier]
... pipeline =
... source
... url
... urlopen
... headers
... logger
...
... [source]
... blueprint = collective.transmogrifier.sections.csvsource
... filename = collective.transmogrifier.tests:urlopener.csv
...
... [url]
... blueprint = collective.transmogrifier.sections.inserter
... key = string:_url
... condition = python:not modules['urllib.parse'].urlsplit(
... item.get('_url', '')).netloc
... value = python:'file://' + modules['posixpath'].join(
... modules['os.path'].dirname(
... modules['collective.transmogrifier'].__file__), item['_url'])
...
... [urlopen]
... blueprint = collective.transmogrifier.sections.urlopener
... handlers = python:[modules[
... 'collective.transmogrifier.sections.tests'].HTTPHandler]
... ignore-error = python:error.code == 404
... cache-directory = var/tests.urlopener.cache.d
...
... [headers]
... blueprint = collective.transmogrifier.sections.inserter
... key = string:_headers
... condition = exists:item/_headers
... value = python:dict(item['_headers'])
...
... [logger]
... blueprint = collective.transmogrifier.sections.logger
... name = logger
... level = INFO
... """
>>> registerConfig(
... 'collective.transmogrifier.sections.tests.urlopener', urlopener)
>>> transmogrifier('collective.transmogrifier.sections.tests.urlopener')
>>> print(handler)
logger INFO
{'_cache': 'var/tests.urlopener.cache.d/http/foo/bar/qux/non-existent.html',
'_headers': {'status': '404 Not Found',
'url': 'http://foo/bar/qux/non-existent.html'},
'_url': 'http://foo/bar/qux/non-existent.html'}
logger INFO
{'_cache': 'var/tests.urlopener.cache.d/http/foo/bar/qux/redirect.html',
'_headers': {'redirect-status': '301 Permanent',
'status': '200 Ok',
'url': 'http://foo/bar/qux/location.html'},
'_url': 'http://foo/bar/qux/redirect.html'}
Response bodies have been written to the cache directory as files, and their headers as RFC 822 messages.
>>> import os
>>> import pprint
>>> pprint.pprint(sorted(list(
... (x[0], sorted(x[1]), sorted(x[2]))
... for x in os.walk('var/tests.urlopener.cache.d')
... ), key= lambda x: x[0]))
[('var/tests.urlopener.cache.d', ['http'], []),
('var/tests.urlopener.cache.d/http', ['foo'], []),
('var/tests.urlopener.cache.d/http/foo', ['bar'], []),
('var/tests.urlopener.cache.d/http/foo/bar', ['qux'], []),
('var/tests.urlopener.cache.d/http/foo/bar/qux',
[],
['non-existent.html',
'non-existent.html...',
'redirect.html',
'redirect.html...'])]