summaryrefslogtreecommitdiffstatshomepage
diff options
context:
space:
mode:
authorVictor Stinner <victor.stinner@gmail.com>2017-07-12 14:51:46 +0200
committerlarryhastings <larry@hastings.org>2017-07-12 14:51:46 +0200
commitcc54c1c0d2d05fe7404ba64c53df4b1352ed2262 (patch)
treedf73846d6ca927314bdb24125bb902dbcfc942eb
parent71572bbe82aa0836c036d44d41c8269ba6a321be (diff)
downloadcpython-cc54c1c0d2d05fe7404ba64c53df4b1352ed2262.tar.gz
cpython-cc54c1c0d2d05fe7404ba64c53df4b1352ed2262.zip
bpo-30500: urllib: Simplify splithost by calling into urlparse. (#1849) (#2291)
The current regex-based splitting produces a wrong result. For example, take ``http://abc#@def``: web browsers parse that URL as ``http://abc/#@def``, that is, the host is ``abc``, the path is ``/``, and the fragment is ``@def`` (the ``#`` only introduces the fragment). (cherry picked from commit 90e01e50ef8a9e6c91f30d965563c378a4ad26de)
-rw-r--r--Lib/test/test_urlparse.py51
-rw-r--r--Lib/urllib/parse.py2
-rw-r--r--Misc/ACKS1
-rw-r--r--Misc/NEWS.d/next/Security/2017-07-11-22-02-51.bpo-30500.wXUrkQ.rst4
4 files changed, 45 insertions, 13 deletions
diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py
index 1775ef33536..f087cedbe00 100644
--- a/Lib/test/test_urlparse.py
+++ b/Lib/test/test_urlparse.py
@@ -681,28 +681,35 @@ class UrlParseTestCase(unittest.TestCase):
def test_parse_fragments(self):
# Exercise the allow_fragments parameter of urlparse() and urlsplit()
tests = (
- ("http:#frag", "path"),
- ("//example.net#frag", "path"),
- ("index.html#frag", "path"),
- (";a=b#frag", "params"),
- ("?a=b#frag", "query"),
- ("#frag", "path"),
+ ("http:#frag", "path", "frag"),
+ ("//example.net#frag", "path", "frag"),
+ ("index.html#frag", "path", "frag"),
+ (";a=b#frag", "params", "frag"),
+ ("?a=b#frag", "query", "frag"),
+ ("#frag", "path", "frag"),
+ ("abc#@frag", "path", "@frag"),
+ ("//abc#@frag", "path", "@frag"),
+ ("//abc:80#@frag", "path", "@frag"),
+ ("//abc#@frag:80", "path", "@frag:80"),
)
- for url, attr in tests:
+ for url, attr, expected_frag in tests:
for func in (urllib.parse.urlparse, urllib.parse.urlsplit):
if attr == "params" and func is urllib.parse.urlsplit:
attr = "path"
with self.subTest(url=url, function=func):
result = func(url, allow_fragments=False)
self.assertEqual(result.fragment, "")
- self.assertTrue(getattr(result, attr).endswith("#frag"))
+ self.assertTrue(
+ getattr(result, attr).endswith("#" + expected_frag))
self.assertEqual(func(url, "", False).fragment, "")
result = func(url, allow_fragments=True)
- self.assertEqual(result.fragment, "frag")
- self.assertFalse(getattr(result, attr).endswith("frag"))
- self.assertEqual(func(url, "", True).fragment, "frag")
- self.assertEqual(func(url).fragment, "frag")
+ self.assertEqual(result.fragment, expected_frag)
+ self.assertFalse(
+ getattr(result, attr).endswith(expected_frag))
+ self.assertEqual(func(url, "", True).fragment,
+ expected_frag)
+ self.assertEqual(func(url).fragment, expected_frag)
def test_mixed_types_rejected(self):
# Several functions that process either strings or ASCII encoded bytes
@@ -883,6 +890,26 @@ class Utility_Tests(unittest.TestCase):
self.assertEqual(splithost('/foo/bar/baz.html'),
(None, '/foo/bar/baz.html'))
+ # bpo-30500: # starts a fragment.
+ self.assertEqual(splithost('//127.0.0.1#@host.com'),
+ ('127.0.0.1', '/#@host.com'))
+ self.assertEqual(splithost('//127.0.0.1#@host.com:80'),
+ ('127.0.0.1', '/#@host.com:80'))
+ self.assertEqual(splithost('//127.0.0.1:80#@host.com'),
+ ('127.0.0.1:80', '/#@host.com'))
+
+ # Empty host is returned as empty string.
+ self.assertEqual(splithost("///file"),
+ ('', '/file'))
+
+ # Trailing semicolon, question mark and hash symbol are kept.
+ self.assertEqual(splithost("//example.net/file;"),
+ ('example.net', '/file;'))
+ self.assertEqual(splithost("//example.net/file?"),
+ ('example.net', '/file?'))
+ self.assertEqual(splithost("//example.net/file#"),
+ ('example.net', '/file#'))
+
def test_splituser(self):
splituser = urllib.parse.splituser
self.assertEqual(splituser('User:Pass@www.python.org:080'),
diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py
index d36833111fb..d7b4e884632 100644
--- a/Lib/urllib/parse.py
+++ b/Lib/urllib/parse.py
@@ -865,7 +865,7 @@ def splithost(url):
"""splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
global _hostprog
if _hostprog is None:
- _hostprog = re.compile('^//([^/?]*)(.*)$')
+ _hostprog = re.compile('//([^/#?]*)(.*)', re.DOTALL)
match = _hostprog.match(url)
if match:
diff --git a/Misc/ACKS b/Misc/ACKS
index bc188119b20..2e45f3d014b 100644
--- a/Misc/ACKS
+++ b/Misc/ACKS
@@ -994,6 +994,7 @@ Max Neunhöffer
Anthon van der Neut
George Neville-Neil
Hieu Nguyen
+Nam Nguyen
Johannes Nicolai
Samuel Nicolary
Jonathan Niehof
diff --git a/Misc/NEWS.d/next/Security/2017-07-11-22-02-51.bpo-30500.wXUrkQ.rst b/Misc/NEWS.d/next/Security/2017-07-11-22-02-51.bpo-30500.wXUrkQ.rst
new file mode 100644
index 00000000000..6570e709d6b
--- /dev/null
+++ b/Misc/NEWS.d/next/Security/2017-07-11-22-02-51.bpo-30500.wXUrkQ.rst
@@ -0,0 +1,4 @@
+Fix urllib.parse.splithost() to correctly parse fragments. For example,
+``splithost('//127.0.0.1#@evil.com/')`` now correctly returns the
+``127.0.0.1`` host, instead of treating ``@evil.com`` as the host in an
+authentication (``login@host``).