blob: d29907f24bbec670dbcb82fcae8528062d352e6a (
plain) (
blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
|
# Generates urlencoded.txt from utf-8.txt
#
# urlencoded.txt is used by Tests_Formatting_Utf8UriEncode
import urllib, codecs, re
import sys
# Matches the upper-case percent-escapes that urlencode() gets back from the
# standard library's quote(): a '%' followed by exactly two upper-case
# hexadecimal digits. The two digits are captured as group 1 for fix().
# Only real hex digits are accepted so that int(octet, 16) in fix() can never
# be handed a pair such as "ZZ".
capfix = re.compile(r"%([0-9A-F]{2})")
def fix(match):
    """Return the replacement text for one percent-escape matched by a regex.

    ``match.group(1)`` must hold the two hexadecimal digits of the escape.

    * Values of 128 and above stay percent-encoded, with the hex digits
      converted to lower case (``C3`` -> ``%c3``).
    * Values below 128 are turned back into the character they stand for,
      lower-cased (``20`` -> ``" "``).
    """
    hex_pair = match.group(1)
    byte_value = int(hex_pair, 16)
    if byte_value >= 128:
        # Non-ASCII byte: keep the escape, only lower-case its digits.
        return '%' + hex_pair.lower()
    # ASCII byte: undo the escape.
    # NOTE(review): .lower() also lower-cases a decoded letter
    # (e.g. 41 -> 'a'); presumably letters never arrive here escaped -- confirm.
    return chr(byte_value).lower()
def urlencode(line):
    """Percent-encode each byte of non-ASCII unicode characters.

    The line is stripped of surrounding whitespace, encoded as UTF-8 and run
    through the standard library's ``quote``. The quoted text is then
    post-processed by substituting every ``capfix`` match with ``fix``.

    Runs on both Python 2 (``urllib.quote``) and Python 3
    (``urllib.parse.quote``).

    :param line: one line of text (unicode on Python 2, str on Python 3).
    :returns: the encoded line.
    """
    # Python 2 keeps quote() on the top-level urllib module; Python 3 moved it
    # to urllib.parse, so fall back to that when the attribute is missing.
    quote = getattr(urllib, "quote", None)
    if quote is None:
        from urllib.parse import quote
    quoted = quote(line.strip().encode("utf-8"))
    return capfix.sub(fix, quoted)
if __name__ == "__main__":
    # Script entry point: read UTF-8 lines from stdin, encode each one with
    # urlencode() and write the results to stdout joined by "\n" (no newline
    # is written after the last line).
    args = sys.argv[1:]
    if args and args[0] in ("-h", "--help"):
        # The parenthesised single-argument form is valid on Python 2 and 3.
        print("Usage: python urlencode.py < utf-8.txt > urlencoded.txt")
        sys.exit(2)

    # The codecs reader/writer need byte streams. Python 3 exposes them as
    # .buffer on the text-mode std streams; where that attribute is absent
    # (Python 2) the std streams themselves are wrapped, as before.
    sys.stdin = codecs.getreader("utf-8")(getattr(sys.stdin, "buffer", sys.stdin))
    sys.stdout = codecs.getwriter("ascii")(getattr(sys.stdout, "buffer", sys.stdout))

    lines = sys.stdin.readlines()
    sys.stdout.write("\n".join(map(urlencode, lines)))
|