diff options
author | Victor Stinner <victor.stinner@gmail.com> | 2017-07-12 14:51:46 +0200 |
---|---|---|
committer | larryhastings <larry@hastings.org> | 2017-07-12 14:51:46 +0200 |
commit | cc54c1c0d2d05fe7404ba64c53df4b1352ed2262 (patch) | |
tree | df73846d6ca927314bdb24125bb902dbcfc942eb | |
parent | 71572bbe82aa0836c036d44d41c8269ba6a321be (diff) | |
download | cpython-cc54c1c0d2d05fe7404ba64c53df4b1352ed2262.tar.gz cpython-cc54c1c0d2d05fe7404ba64c53df4b1352ed2262.zip |
bpo-30500: urllib: Simplify splithost by calling into urlparse. (#1849) (#2291)
The current regex based splitting produces a wrong result. For example::
http://abc#@def
Web browsers parse that URL as ``http://abc/#@def``, that is, the host
is ``abc``, the path is ``/``, and the fragment is ``#@def``.
(cherry picked from commit 90e01e50ef8a9e6c91f30d965563c378a4ad26de)
-rw-r--r-- | Lib/test/test_urlparse.py | 51 | ||||
-rw-r--r-- | Lib/urllib/parse.py | 2 | ||||
-rw-r--r-- | Misc/ACKS | 1 | ||||
-rw-r--r-- | Misc/NEWS.d/next/Security/2017-07-11-22-02-51.bpo-30500.wXUrkQ.rst | 4 |
4 files changed, 45 insertions, 13 deletions
diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py index 1775ef33536..f087cedbe00 100644 --- a/Lib/test/test_urlparse.py +++ b/Lib/test/test_urlparse.py @@ -681,28 +681,35 @@ class UrlParseTestCase(unittest.TestCase): def test_parse_fragments(self): # Exercise the allow_fragments parameter of urlparse() and urlsplit() tests = ( - ("http:#frag", "path"), - ("//example.net#frag", "path"), - ("index.html#frag", "path"), - (";a=b#frag", "params"), - ("?a=b#frag", "query"), - ("#frag", "path"), + ("http:#frag", "path", "frag"), + ("//example.net#frag", "path", "frag"), + ("index.html#frag", "path", "frag"), + (";a=b#frag", "params", "frag"), + ("?a=b#frag", "query", "frag"), + ("#frag", "path", "frag"), + ("abc#@frag", "path", "@frag"), + ("//abc#@frag", "path", "@frag"), + ("//abc:80#@frag", "path", "@frag"), + ("//abc#@frag:80", "path", "@frag:80"), ) - for url, attr in tests: + for url, attr, expected_frag in tests: for func in (urllib.parse.urlparse, urllib.parse.urlsplit): if attr == "params" and func is urllib.parse.urlsplit: attr = "path" with self.subTest(url=url, function=func): result = func(url, allow_fragments=False) self.assertEqual(result.fragment, "") - self.assertTrue(getattr(result, attr).endswith("#frag")) + self.assertTrue( + getattr(result, attr).endswith("#" + expected_frag)) self.assertEqual(func(url, "", False).fragment, "") result = func(url, allow_fragments=True) - self.assertEqual(result.fragment, "frag") - self.assertFalse(getattr(result, attr).endswith("frag")) - self.assertEqual(func(url, "", True).fragment, "frag") - self.assertEqual(func(url).fragment, "frag") + self.assertEqual(result.fragment, expected_frag) + self.assertFalse( + getattr(result, attr).endswith(expected_frag)) + self.assertEqual(func(url, "", True).fragment, + expected_frag) + self.assertEqual(func(url).fragment, expected_frag) def test_mixed_types_rejected(self): # Several functions that process either strings or ASCII encoded bytes @@ -883,6 +890,26 @@ class Utility_Tests(unittest.TestCase): self.assertEqual(splithost('/foo/bar/baz.html'), (None, '/foo/bar/baz.html')) + # bpo-30500: # starts a fragment. + self.assertEqual(splithost('//127.0.0.1#@host.com'), + ('127.0.0.1', '/#@host.com')) + self.assertEqual(splithost('//127.0.0.1#@host.com:80'), + ('127.0.0.1', '/#@host.com:80')) + self.assertEqual(splithost('//127.0.0.1:80#@host.com'), + ('127.0.0.1:80', '/#@host.com')) + + # Empty host is returned as empty string. + self.assertEqual(splithost("///file"), + ('', '/file')) + + # Trailing semicolon, question mark and hash symbol are kept. + self.assertEqual(splithost("//example.net/file;"), + ('example.net', '/file;')) + self.assertEqual(splithost("//example.net/file?"), + ('example.net', '/file?')) + self.assertEqual(splithost("//example.net/file#"), + ('example.net', '/file#')) + def test_splituser(self): splituser = urllib.parse.splituser self.assertEqual(splituser('User:Pass@www.python.org:080'), diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py index d36833111fb..d7b4e884632 100644 --- a/Lib/urllib/parse.py +++ b/Lib/urllib/parse.py @@ -865,7 +865,7 @@ def splithost(url): """splithost('//host[:port]/path') --> 'host[:port]', '/path'.""" global _hostprog if _hostprog is None: - _hostprog = re.compile('^//([^/?]*)(.*)$') + _hostprog = re.compile('//([^/#?]*)(.*)', re.DOTALL) match = _hostprog.match(url) if match: diff --git a/Misc/ACKS b/Misc/ACKS index bc188119b20..2e45f3d014b 100644 --- a/Misc/ACKS +++ b/Misc/ACKS @@ -994,6 +994,7 @@ Max Neunhöffer Anthon van der Neut George Neville-Neil Hieu Nguyen +Nam Nguyen Johannes Nicolai Samuel Nicolary Jonathan Niehof diff --git a/Misc/NEWS.d/next/Security/2017-07-11-22-02-51.bpo-30500.wXUrkQ.rst b/Misc/NEWS.d/next/Security/2017-07-11-22-02-51.bpo-30500.wXUrkQ.rst new file mode 100644 index 00000000000..6570e709d6b --- /dev/null +++ b/Misc/NEWS.d/next/Security/2017-07-11-22-02-51.bpo-30500.wXUrkQ.rst @@ -0,0 +1,4 @@ +Fix urllib.parse.splithost() to correctly parse fragments. For example, +``splithost('//127.0.0.1#@evil.com/')`` now correctly returns the +``127.0.0.1`` host, instead of treating ``@evil.com`` as the host in an +authentification (``login@host``). |