3 files changed, 180 insertions, 19 deletions
diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py
index 4d7fcec94b1..99a6977ebff 100644
--- a/Lib/urllib/parse.py
+++ b/Lib/urllib/parse.py
@@ -156,9 +156,8 @@ class _NetlocResultMixinBase(object):
         port = self._hostinfo[1]
         if port is not None:
             port = int(port, 10)
-            # Return None on an illegal port
             if not ( 0 <= port <= 65535):
-                return None
+                raise ValueError("Port out of range 0-65535")
         return port
 
 
@@ -225,8 +224,71 @@ class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes):
 from collections import namedtuple
 
 _DefragResultBase = namedtuple('DefragResult', 'url fragment')
-_SplitResultBase = namedtuple('SplitResult', 'scheme netloc path query fragment')
-_ParseResultBase = namedtuple('ParseResult', 'scheme netloc path params query fragment')
+_SplitResultBase = namedtuple(
+    'SplitResult', 'scheme netloc path query fragment')
+_ParseResultBase = namedtuple(
+    'ParseResult', 'scheme netloc path params query fragment')
+
+_DefragResultBase.__doc__ = """
+DefragResult(url, fragment)
+
+A 2-tuple that contains the url without fragment identifier and the fragment
+identifier as a separate argument.
+"""
+
+_DefragResultBase.url.__doc__ = """The URL with no fragment identifier."""
+
+_DefragResultBase.fragment.__doc__ = """
+Fragment identifier separated from URL, that allows indirect identification of a
+secondary resource by reference to a primary resource and additional identifying
+information.
+"""
+
+_SplitResultBase.__doc__ = """
+SplitResult(scheme, netloc, path, query, fragment)
+
+A 5-tuple that contains the different components of a URL. Similar to
+ParseResult, but does not split params.
+"""
+
+_SplitResultBase.scheme.__doc__ = """Specifies URL scheme for the request."""
+
+_SplitResultBase.netloc.__doc__ = """
+Network location where the request is made to.
+"""
+
+_SplitResultBase.path.__doc__ = """
+The hierarchical path, such as the path to a file to download.
+"""
+
+_SplitResultBase.query.__doc__ = """
+The query component, that contains non-hierarchical data, that along with data
+in path component, identifies a resource in the scope of URI's scheme and
+network location.
+"""
+
+_SplitResultBase.fragment.__doc__ = """
+Fragment identifier, that allows indirect identification of a secondary resource
+by reference to a primary resource and additional identifying information.
+"""
+
+_ParseResultBase.__doc__ = """
+ParseResult(scheme, netloc, path, params,  query, fragment)
+
+A 6-tuple that contains components of a parsed URL.
+"""
+
+_ParseResultBase.scheme.__doc__ = _SplitResultBase.scheme.__doc__
+_ParseResultBase.netloc.__doc__ = _SplitResultBase.netloc.__doc__
+_ParseResultBase.path.__doc__ = _SplitResultBase.path.__doc__
+_ParseResultBase.params.__doc__ = """
+Parameters for last path element used to dereference the URI in order to provide
+access to perform some operation on the resource.
+"""
+
+_ParseResultBase.query.__doc__ = _SplitResultBase.query.__doc__
+_ParseResultBase.fragment.__doc__ = _SplitResultBase.fragment.__doc__
+
 
 # For backwards compatibility, alias _NetlocResultMixinStr
 # ResultBase is no longer part of the documented API, but it is
diff --git a/Lib/urllib/request.py b/Lib/urllib/request.py
index a4bf97dcd38..5f15b74f4de 100644
--- a/Lib/urllib/request.py
+++ b/Lib/urllib/request.py
@@ -134,13 +134,73 @@ __all__ = [
 ]
 
 # used in User-Agent header sent
-__version__ = sys.version[:3]
+__version__ = '%d.%d' % sys.version_info[:2]
 
 _opener = None
 def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
             *, cafile=None, capath=None, cadefault=False, context=None):
+    '''Open the URL url, which can be either a string or a Request object.
+
+    *data* must be an object specifying additional data to be sent to
+    the server, or None if no such data is needed.  See Request for
+    details.
+
+    urllib.request module uses HTTP/1.1 and includes a "Connection:close"
+    header in its HTTP requests.
+
+    The optional *timeout* parameter specifies a timeout in seconds for
+    blocking operations like the connection attempt (if not specified, the
+    global default timeout setting will be used). This only works for HTTP,
+    HTTPS and FTP connections.
+
+    If *context* is specified, it must be a ssl.SSLContext instance describing
+    the various SSL options. See HTTPSConnection for more details.
+
+    The optional *cafile* and *capath* parameters specify a set of trusted CA
+    certificates for HTTPS requests. cafile should point to a single file
+    containing a bundle of CA certificates, whereas capath should point to a
+    directory of hashed certificate files. More information can be found in
+    ssl.SSLContext.load_verify_locations().
+
+    The *cadefault* parameter is ignored.
+
+    This function always returns an object which can work as a context
+    manager and has methods such as
+
+    * geturl() - return the URL of the resource retrieved, commonly used to
+      determine if a redirect was followed
+
+    * info() - return the meta-information of the page, such as headers, in the
+      form of an email.message_from_string() instance (see Quick Reference to
+      HTTP Headers)
+
+    * getcode() - return the HTTP status code of the response.  Raises URLError
+      on errors.
+
+    For HTTP and HTTPS URLs, this function returns a http.client.HTTPResponse
+    object slightly modified. In addition to the three new methods above, the
+    msg attribute contains the same information as the reason attribute ---
+    the reason phrase returned by the server --- instead of the response
+    headers as it is specified in the documentation for HTTPResponse.
+
+    For FTP, file, and data URLs and requests explicitly handled by legacy
+    URLopener and FancyURLopener classes, this function returns a
+    urllib.response.addinfourl object.
+
+    Note that None may be returned if no handler handles the request (though
+    the default installed global OpenerDirector uses UnknownHandler to ensure
+    this never happens).
+
+    In addition, if proxy settings are detected (for example, when a *_proxy
+    environment variable like http_proxy is set), ProxyHandler is default
+    installed and makes sure the requests are handled through the proxy.
+
+    '''
     global _opener
     if cafile or capath or cadefault:
+        import warnings
+        warnings.warn("cafile, cpath and cadefault are deprecated, use a "
+                      "custom context instead.", DeprecationWarning, 2)
         if context is not None:
             raise ValueError(
                 "You can't pass both context and any of cafile, capath, and "
@@ -1170,6 +1230,11 @@ class AbstractHTTPHandler(BaseHandler):
     def set_http_debuglevel(self, level):
         self._debuglevel = level
 
+    def _get_content_length(self, request):
+        return http.client.HTTPConnection._get_content_length(
+            request.data,
+            request.get_method())
+
     def do_request_(self, request):
         host = request.host
         if not host:
@@ -1178,24 +1243,22 @@ class AbstractHTTPHandler(BaseHandler):
         if request.data is not None:  # POST
             data = request.data
             if isinstance(data, str):
-                msg = "POST data should be bytes or an iterable of bytes. " \
-                      "It cannot be of type str."
+                msg = "POST data should be bytes, an iterable of bytes, " \
+                      "or a file object. It cannot be of type str."
                 raise TypeError(msg)
             if not request.has_header('Content-type'):
                 request.add_unredirected_header(
                     'Content-type',
                     'application/x-www-form-urlencoded')
-            if not request.has_header('Content-length'):
-                try:
-                    mv = memoryview(data)
-                except TypeError:
-                    if isinstance(data, collections.Iterable):
-                        raise ValueError("Content-Length should be specified "
-                                "for iterable data of type %r %r" % (type(data),
-                                data))
+            if (not request.has_header('Content-length')
+                    and not request.has_header('Transfer-encoding')):
+                content_length = self._get_content_length(request)
+                if content_length is not None:
+                    request.add_unredirected_header(
+                            'Content-length', str(content_length))
                 else:
                     request.add_unredirected_header(
-                            'Content-length', '%d' % (len(mv) * mv.itemsize))
+                            'Transfer-encoding', 'chunked')
 
         sel_host = host
         if request.has_proxy():
@@ -1251,7 +1314,8 @@ class AbstractHTTPHandler(BaseHandler):
 
         try:
             try:
-                h.request(req.get_method(), req.selector, req.data, headers)
+                h.request(req.get_method(), req.selector, req.data, headers,
+                          encode_chunked=req.has_header('Transfer-encoding'))
             except OSError as err: # timeout error
                 raise URLError(err)
             r = h.getresponse()
diff --git a/Lib/urllib/robotparser.py b/Lib/urllib/robotparser.py
index 8b69fd985ee..85add1624ae 100644
--- a/Lib/urllib/robotparser.py
+++ b/Lib/urllib/robotparser.py
@@ -10,7 +10,9 @@
     http://www.robotstxt.org/norobots-rfc.txt
 """
 
-import urllib.parse, urllib.request
+import collections
+import urllib.parse
+import urllib.request
 
 __all__ = ["RobotFileParser"]
 
@@ -120,10 +122,29 @@ class RobotFileParser:
                     if state != 0:
                         entry.rulelines.append(RuleLine(line[1], True))
                         state = 2
+                elif line[0] == "crawl-delay":
+                    if state != 0:
+                        # before trying to convert to int we need to make
+                        # sure that robots.txt has valid syntax otherwise
+                        # it will crash
+                        if line[1].strip().isdigit():
+                            entry.delay = int(line[1])
+                        state = 2
+                elif line[0] == "request-rate":
+                    if state != 0:
+                        numbers = line[1].split('/')
+                        # check if all values are sane
+                        if (len(numbers) == 2 and numbers[0].strip().isdigit()
+                            and numbers[1].strip().isdigit()):
+                            req_rate = collections.namedtuple('req_rate',
+                                                              'requests seconds')
+                            entry.req_rate = req_rate
+                            entry.req_rate.requests = int(numbers[0])
+                            entry.req_rate.seconds = int(numbers[1])
+                        state = 2
         if state == 2:
             self._add_entry(entry)
 
-
     def can_fetch(self, useragent, url):
         """using the parsed robots.txt decide if useragent can fetch url"""
         if self.disallow_all:
@@ -153,6 +174,18 @@ class RobotFileParser:
         # agent not found ==> access granted
         return True
 
+    def crawl_delay(self, useragent):
+        for entry in self.entries:
+            if entry.applies_to(useragent):
+                return entry.delay
+        return None
+
+    def request_rate(self, useragent):
+        for entry in self.entries:
+            if entry.applies_to(useragent):
+                return entry.req_rate
+        return None
+
     def __str__(self):
         return ''.join([str(entry) + "\n" for entry in self.entries])
 
@@ -180,6 +213,8 @@ class Entry:
     def __init__(self):
         self.useragents = []
         self.rulelines = []
+        self.delay = None
+        self.req_rate = None
 
     def __str__(self):
         ret = []