[packages/python-lxml] Release 2. Patch for https://bugs.launchpad.net/lxml/+bug/1250557 added.
matkor
matkor at pld-linux.org
Mon Nov 25 13:27:43 CET 2013
commit 6ada93dfa9eef5fd90211395aef60ba663cc9421
Author: Mateusz Korniak <matkor at pld-linux.org>
Date: Mon Nov 25 13:27:17 2013 +0100
Release 2. Patch for https://bugs.launchpad.net/lxml/+bug/1250557 added.
...failures-option-to-make_links_absolute-to.patch | 144 +++++++++++++++++++++
python-lxml.spec | 4 +-
2 files changed, 147 insertions(+), 1 deletion(-)
---
diff --git a/python-lxml.spec b/python-lxml.spec
index 40f96e9..8aee4ca 100644
--- a/python-lxml.spec
+++ b/python-lxml.spec
@@ -9,11 +9,12 @@ Summary: Python 2 binding for the libxml2 and libxslt libraries
Summary(pl.UTF-8): Wiązanie Pythona 2 do bibliotek libxml2 i libxslt
Name: python-%{module}
Version: 3.2.4
-Release: 1
+Release: 2
License: BSD
Group: Libraries/Python
Source0: http://lxml.de/files/%{module}-%{version}.tgz
# Source0-md5: cc363499060f615aca1ec8dcc04df331
+Patch0: %{name}-add-handle_failures-option-to-make_links_absolute-to.patch
URL: http://lxml.de/
BuildRequires: libxml2-devel >= 1:2.7.8
BuildRequires: libxslt-devel >= 1.1.26
@@ -60,6 +61,7 @@ Dokumentacja API modułu lxml.
%prep
%setup -q -n %{module}-%{version}
+%patch0 -p1
%build
%if %{with python2}
diff --git a/python-lxml-add-handle_failures-option-to-make_links_absolute-to.patch b/python-lxml-add-handle_failures-option-to-make_links_absolute-to.patch
new file mode 100644
index 0000000..0c28250
--- /dev/null
+++ b/python-lxml-add-handle_failures-option-to-make_links_absolute-to.patch
@@ -0,0 +1,144 @@
+From ab497930d74c7bcf4b725809508a1fefef453faa Mon Sep 17 00:00:00 2001
+From: Stefan Behnel <stefan_ml at behnel.de>
+Date: Fri, 15 Nov 2013 14:49:48 +0100
+Subject: [PATCH] add 'handle_failures' option to make_links_absolute() to
+ allow graceful handling of broken URLs
+
+---
+ CHANGES.txt | 4 +++
+ src/lxml/html/__init__.py | 49 +++++++++++++++++++++++++------
+ src/lxml/html/tests/test_rewritelinks.txt | 21 ++++++++++---
+ 3 files changed, 61 insertions(+), 13 deletions(-)
+
+diff --git a/src/lxml/html/__init__.py b/src/lxml/html/__init__.py
+index ea88d2b..dd52611 100644
+--- a/src/lxml/html/__init__.py
++++ b/src/lxml/html/__init__.py
+@@ -294,15 +294,21 @@ class HtmlMixin(object):
+ ## Link functions
+ ########################################
+
+- def make_links_absolute(self, base_url=None, resolve_base_href=True):
++ def make_links_absolute(self, base_url=None, resolve_base_href=True,
++ handle_failures=None):
+ """
+ Make all links in the document absolute, given the
+ ``base_url`` for the document (the full URL where the document
+- came from), or if no ``base_url`` is given, then the ``.base_url`` of the document.
++ came from), or if no ``base_url`` is given, then the ``.base_url``
++ of the document.
+
+ If ``resolve_base_href`` is true, then any ``<base href>``
+ tags in the document are used *and* removed from the document.
+ If it is false then any such tag is ignored.
++
++ If ``handle_failures`` is None (default), a failure to process
++ a URL will abort the processing. If set to 'ignore', errors
++ are ignored. If set to 'discard', failing URLs will be removed.
+ """
+ if base_url is None:
+ base_url = self.base_url
+@@ -311,24 +317,48 @@ class HtmlMixin(object):
+ "No base_url given, and the document has no base_url")
+ if resolve_base_href:
+ self.resolve_base_href()
+- def link_repl(href):
+- return urljoin(base_url, href)
++
++ if handle_failures == 'ignore':
++ def link_repl(href):
++ try:
++ return urljoin(base_url, href)
++ except ValueError:
++ return href
++ elif handle_failures == 'discard':
++ def link_repl(href):
++ try:
++ return urljoin(base_url, href)
++ except ValueError:
++ return None
++ elif handle_failures is None:
++ def link_repl(href):
++ return urljoin(base_url, href)
++ else:
++ raise ValueError(
++ "unexpected value for handle_failures: %r" % handle_failures)
++
+ self.rewrite_links(link_repl)
+
+- def resolve_base_href(self):
++ def resolve_base_href(self, handle_failures=None):
+ """
+ Find any ``<base href>`` tag in the document, and apply its
+ values to all links found in the document. Also remove the
+ tag once it has been applied.
++
++ If ``handle_failures`` is None (default), a failure to process
++ a URL will abort the processing. If set to 'ignore', errors
++ are ignored. If set to 'discard', failing URLs will be removed.
+ """
+ base_href = None
+- basetags = self.xpath('//base[@href]|//x:base[@href]', namespaces={'x':XHTML_NAMESPACE})
++ basetags = self.xpath('//base[@href]|//x:base[@href]',
++ namespaces={'x': XHTML_NAMESPACE})
+ for b in basetags:
+ base_href = b.get('href')
+ b.drop_tree()
+ if not base_href:
+ return
+- self.make_links_absolute(base_href, resolve_base_href=False)
++ self.make_links_absolute(base_href, resolve_base_href=False,
++ handle_failures=handle_failures)
+
+ def iterlinks(self):
+ """
+@@ -434,6 +464,7 @@ class HtmlMixin(object):
+ base_href, resolve_base_href=resolve_base_href)
+ elif resolve_base_href:
+ self.resolve_base_href()
++
+ for el, attrib, link, pos in self.iterlinks():
+ new_link = link_repl_func(link.strip())
+ if new_link == link:
+diff --git a/src/lxml/html/tests/test_rewritelinks.txt b/src/lxml/html/tests/test_rewritelinks.txt
+index 43dd99d..dd400b7 100644
+--- a/src/lxml/html/tests/test_rewritelinks.txt
++++ b/src/lxml/html/tests/test_rewritelinks.txt
+@@ -185,6 +185,22 @@ An application of ``iterlinks()`` is ``make_links_absolute()``::
+ </body>
+ </html>
+
++If the document contains invalid links, you may choose to "discard" or "ignore"
++them by passing the respective option into the ``handle_failures`` argument::
++
++ >>> html = lxml.html.fromstring ('''\
++ ... <html><body><div>
++ ... <a href="http://fancybase.com]Buy">test2</a>
++ ... </div></body></html>''')
++
++ >>> html.make_links_absolute(base_url="http://my.little.server/url/",
++ ... handle_failures="discard")
++
++ >>> print(lxml.html.tostring (html, pretty_print=True, encoding='unicode'))
++ <html><body><div>
++ <a>test2</a>
++ </div></body></html>
++
+ Check if we can replace multiple links inside of the same text string::
+
+ >>> html = lxml.html.fromstring ("""\
+@@ -209,10 +225,7 @@ Check if we can replace multiple links inside of the same text string::
+
+ >>> html.make_links_absolute ()
+
+- >>> try: _unicode = unicode
+- ... except NameError: _unicode = str
+-
+- >>> print(lxml.html.tostring (html, pretty_print = True, encoding=_unicode))
++ >>> print(lxml.html.tostring (html, pretty_print=True, encoding='unicode'))
+ <html>
+ <head>
+ <title>Test</title>
+--
+1.8.4.3
+
================================================================
---- gitweb:
http://git.pld-linux.org/gitweb.cgi/packages/python-lxml.git/commitdiff/6ada93dfa9eef5fd90211395aef60ba663cc9421
More information about the pld-cvs-commit mailing list