summaryrefslogtreecommitdiffstatshomepage
path: root/tests/phpunit/data/formatting/utf-8
diff options
context:
space:
mode:
authorAndrew Nacin <nacin@git.wordpress.org>2013-08-29 18:39:34 +0000
committerAndrew Nacin <nacin@git.wordpress.org>2013-08-29 18:39:34 +0000
commit8045afd81b7c80f6ef5b327c115a5bbb43e4b65c (patch)
tree15d457007610c451577debda89bd9e9cd3d74551 /tests/phpunit/data/formatting/utf-8
parentd34baebc1d8111c9c1014e11001957face778e52 (diff)
downloadwordpress-8045afd81b7c80f6ef5b327c115a5bbb43e4b65c.tar.gz
wordpress-8045afd81b7c80f6ef5b327c115a5bbb43e4b65c.zip
Move PHPUnit tests into a tests/phpunit directory.
wp-tests-config.php can/should reside in the root of a develop checkout. `phpunit` should be run from the root. see #25088. git-svn-id: https://develop.svn.wordpress.org/trunk@25165 602fd350-edb4-49c9-b593-d223f7449a82
Diffstat (limited to 'tests/phpunit/data/formatting/utf-8')
-rw-r--r--tests/phpunit/data/formatting/utf-8/README15
-rw-r--r--tests/phpunit/data/formatting/utf-8/entitize.py24
-rw-r--r--tests/phpunit/data/formatting/utf-8/entitized.txt5
-rw-r--r--tests/phpunit/data/formatting/utf-8/u-urlencode.py24
-rw-r--r--tests/phpunit/data/formatting/utf-8/u-urlencoded.txt5
-rw-r--r--tests/phpunit/data/formatting/utf-8/urlencode.py33
-rw-r--r--tests/phpunit/data/formatting/utf-8/urlencoded.txt5
-rw-r--r--tests/phpunit/data/formatting/utf-8/utf-8.txt5
8 files changed, 116 insertions, 0 deletions
diff --git a/tests/phpunit/data/formatting/utf-8/README b/tests/phpunit/data/formatting/utf-8/README
new file mode 100644
index 0000000000..5bc6a317d3
--- /dev/null
+++ b/tests/phpunit/data/formatting/utf-8/README
@@ -0,0 +1,15 @@
+The Python scripts are for generating test data, because Python's Unicode
+support is much, much, much, much better than PHP's.
+
+ * `utf-8/urlencode.py`, `utf-8/u-urlencode.py` and `utf-8/entitize.py` process UTF-8
+ into a few different formats (%-encoding, %u-encoding, &#decimal;)
+ and are used like normal UNIXy pipes.
+
+ Try:
+
+ `python urlencode.py < utf-8.txt > urlencoded.txt`
+ `python u-urlencode.py < utf-8.txt > u-urlencoded.txt`
+ `python entitize.py < utf-8.txt > entitized.txt`
+
+ * `windows-1252.py` converts Windows-only smart quotes and similar characters
+ into their equivalent Unicode `&#decimal;` character references.
diff --git a/tests/phpunit/data/formatting/utf-8/entitize.py b/tests/phpunit/data/formatting/utf-8/entitize.py
new file mode 100644
index 0000000000..efa7cb18d5
--- /dev/null
+++ b/tests/phpunit/data/formatting/utf-8/entitize.py
@@ -0,0 +1,24 @@
+# Generates entitized.txt from utf-8.txt
+#
+# entitized.txt is used by Tests_Formatting_UrlEncodedToEntities
+
+import codecs
+import sys
+
+def entitize(line):
+    """Return *line* as a run of decimal ``&#N;`` character references.
+
+    Leading and trailing whitespace (the newline included) is stripped
+    first; every remaining character, ASCII or not, becomes one reference.
+    """
+    references = []
+    for char in line.strip():
+        references.append("&#%d;" % ord(char))
+    return "".join(references)
+
+if __name__ == "__main__":
+    # Print the usage line and exit with status 2 when help is requested.
+    cli_args = sys.argv[1:]
+    if cli_args and cli_args[0] in ("-h", "--help"):
+        print("Usage: python entitize.py < utf-8.txt > entitized.txt")
+        sys.exit(2)
+
+    # Read stdin as UTF-8 text; everything written out must be plain ASCII.
+    sys.stdin = codecs.getreader("utf-8")(sys.stdin)
+    sys.stdout = codecs.getwriter("ascii")(sys.stdout)
+
+    # One output line per input line, with no newline after the last one.
+    converted = [entitize(raw) for raw in sys.stdin.readlines()]
+    sys.stdout.write("\n".join(converted))
diff --git a/tests/phpunit/data/formatting/utf-8/entitized.txt b/tests/phpunit/data/formatting/utf-8/entitized.txt
new file mode 100644
index 0000000000..a29c9f9216
--- /dev/null
+++ b/tests/phpunit/data/formatting/utf-8/entitized.txt
@@ -0,0 +1,5 @@
+&#31456;&#23376;&#24609;
+&#70;&#114;&#97;&#110;&#231;&#111;&#105;&#115;&#32;&#84;&#114;&#117;&#102;&#102;&#97;&#117;&#116;
+&#4321;&#4304;&#4325;&#4304;&#4320;&#4311;&#4309;&#4308;&#4314;&#4317;
+&#66;&#106;&#246;&#114;&#107;&#32;&#71;&#117;&#240;&#109;&#117;&#110;&#100;&#115;&#100;&#243;&#116;&#116;&#105;&#114;
+&#23470;&#23822;&#12288;&#39423;
\ No newline at end of file
diff --git a/tests/phpunit/data/formatting/utf-8/u-urlencode.py b/tests/phpunit/data/formatting/utf-8/u-urlencode.py
new file mode 100644
index 0000000000..c20a14f1f8
--- /dev/null
+++ b/tests/phpunit/data/formatting/utf-8/u-urlencode.py
@@ -0,0 +1,24 @@
+# Generates u-urlencoded.txt from utf-8.txt
+#
+# u-urlencoded.txt is used by Tests_Formatting_UrlEncodedToEntities
+
+import codecs
+import sys
+
+def uurlencode(line):
+    """Encode text as ``%uXXXX`` escapes, one per UTF-16 code unit.
+
+    Leading and trailing whitespace (the newline included) is stripped
+    first.  Every remaining character, ASCII included, is written as
+    ``%u`` followed by four upper-case hex digits.  A character above
+    U+FFFF is written as a surrogate pair so that each escape keeps
+    exactly four digits.
+    """
+    escapes = []
+    for s in line.strip():
+        codepoint = ord(s)
+        if codepoint > 0xFFFF:
+            # Split an astral code point into its high and low surrogate.
+            offset = codepoint - 0x10000
+            escapes.append("%%u%04X" % (0xD800 + (offset >> 10)))
+            codepoint = 0xDC00 + (offset & 0x3FF)
+        escapes.append("%%u%04X" % codepoint)
+    return "".join(escapes)
+
+if __name__ == "__main__":
+    # Print the usage line and exit with status 2 when help is requested.
+    cli_args = sys.argv[1:]
+    if cli_args and cli_args[0] in ("-h", "--help"):
+        print("Usage: python u-urlencode.py < utf-8.txt > u-urlencoded.txt")
+        sys.exit(2)
+
+    # Read stdin as UTF-8 text; everything written out must be plain ASCII.
+    sys.stdin = codecs.getreader("utf-8")(sys.stdin)
+    sys.stdout = codecs.getwriter("ascii")(sys.stdout)
+
+    # One output line per input line, with no newline after the last one.
+    converted = [uurlencode(raw) for raw in sys.stdin.readlines()]
+    sys.stdout.write("\n".join(converted))
diff --git a/tests/phpunit/data/formatting/utf-8/u-urlencoded.txt b/tests/phpunit/data/formatting/utf-8/u-urlencoded.txt
new file mode 100644
index 0000000000..ad4e422c75
--- /dev/null
+++ b/tests/phpunit/data/formatting/utf-8/u-urlencoded.txt
@@ -0,0 +1,5 @@
+%u7AE0%u5B50%u6021
+%u0046%u0072%u0061%u006E%u00E7%u006F%u0069%u0073%u0020%u0054%u0072%u0075%u0066%u0066%u0061%u0075%u0074
+%u10E1%u10D0%u10E5%u10D0%u10E0%u10D7%u10D5%u10D4%u10DA%u10DD
+%u0042%u006A%u00F6%u0072%u006B%u0020%u0047%u0075%u00F0%u006D%u0075%u006E%u0064%u0073%u0064%u00F3%u0074%u0074%u0069%u0072
+%u5BAE%u5D0E%u3000%u99FF
diff --git a/tests/phpunit/data/formatting/utf-8/urlencode.py b/tests/phpunit/data/formatting/utf-8/urlencode.py
new file mode 100644
index 0000000000..d29907f24b
--- /dev/null
+++ b/tests/phpunit/data/formatting/utf-8/urlencode.py
@@ -0,0 +1,33 @@
+# Generates urlencoded.txt from utf-8.txt
+#
+# urlencoded.txt is used by Tests_Formatting_Utf8UriEncode
+
+import urllib, codecs, re
+import sys
+
+# Matches one upper-case percent escape (%XX).  Only real hex digits are
+# accepted, so a stray "%" followed by other letters is left untouched
+# instead of reaching int(..., 16) and raising ValueError.
+capfix = re.compile(r"%([0-9A-F]{2})")
+
+def fix(match):
+    """Rewrite a single percent escape matched by ``capfix``.
+
+    An escape for a byte below 128 is turned back into the literal
+    character (lower-cased); an escape for any other byte is kept as an
+    escape with its hex digits lower-cased.
+    """
+    octet = match.group(1)
+    intval = int(octet, 16)
+    if intval < 128:
+        return chr(intval).lower()
+    return '%' + octet.lower()
+
+def urlencode(line):
+    """Return *line* stripped and percent-encoded as UTF-8 with lower-case hex.
+
+    ``urllib.quote`` escapes the UTF-8 bytes of the stripped line; the
+    ``capfix``/``fix`` pass then lower-cases the escapes of bytes >= 128
+    and turns escapes of ASCII bytes back into literal characters.
+    """
+    utf8_bytes = line.strip().encode("utf-8")
+    quoted = urllib.quote(utf8_bytes)
+    return capfix.sub(fix, quoted)
+
+if __name__ == "__main__":
+    # Print the usage line and exit with status 2 when help is requested.
+    cli_args = sys.argv[1:]
+    if cli_args and cli_args[0] in ("-h", "--help"):
+        print("Usage: python urlencode.py < utf-8.txt > urlencoded.txt")
+        sys.exit(2)
+
+    # Read stdin as UTF-8 text; everything written out must be plain ASCII.
+    sys.stdin = codecs.getreader("utf-8")(sys.stdin)
+    sys.stdout = codecs.getwriter("ascii")(sys.stdout)
+
+    # One output line per input line, with no newline after the last one.
+    converted = [urlencode(raw) for raw in sys.stdin.readlines()]
+    sys.stdout.write("\n".join(converted))
diff --git a/tests/phpunit/data/formatting/utf-8/urlencoded.txt b/tests/phpunit/data/formatting/utf-8/urlencoded.txt
new file mode 100644
index 0000000000..930bf13ff6
--- /dev/null
+++ b/tests/phpunit/data/formatting/utf-8/urlencoded.txt
@@ -0,0 +1,5 @@
+%e7%ab%a0%e5%ad%90%e6%80%a1
+Fran%c3%a7ois Truffaut
+%e1%83%a1%e1%83%90%e1%83%a5%e1%83%90%e1%83%a0%e1%83%97%e1%83%95%e1%83%94%e1%83%9a%e1%83%9d
+Bj%c3%b6rk Gu%c3%b0mundsd%c3%b3ttir
+%e5%ae%ae%e5%b4%8e%e3%80%80%e9%a7%bf
diff --git a/tests/phpunit/data/formatting/utf-8/utf-8.txt b/tests/phpunit/data/formatting/utf-8/utf-8.txt
new file mode 100644
index 0000000000..1596029d20
--- /dev/null
+++ b/tests/phpunit/data/formatting/utf-8/utf-8.txt
@@ -0,0 +1,5 @@
+章子怡
+François Truffaut
+საქართველო
+Björk Guðmundsdóttir
+宮崎 駿