summaryrefslogtreecommitdiffstatshomepage
diff options
context:
space:
mode:
authorDamien George <damien.p.george@gmail.com>2014-07-16 11:45:10 +0100
committerDamien George <damien.p.george@gmail.com>2014-07-19 18:34:04 +0100
commit1694bc733d3ace8072c58e7da457dd2995998189 (patch)
tree66ec95d356955b18f8944acc5feb0ce9d6b5390f
parent02bc882c3dbfcfd6c825cddf53ededac9af65f02 (diff)
downloadmicropython-1694bc733d3ace8072c58e7da457dd2995998189.tar.gz
micropython-1694bc733d3ace8072c58e7da457dd2995998189.zip
py: Add stream reading of n unicode chars; unicode support by default.
With unicode enabled, this patch allows reading a fixed number of characters from text-mode streams; e.g. file.read(5) will read 5 unicode chars, which can be made up of more than 5 bytes. For an ASCII stream (i.e. no chars > 127) it only needs to do 1 read. If there are lots of non-ASCII chars in a stream, then it needs multiple reads of the underlying object. Adds a new test for this case. Enables unicode support by default on unix and stmhal ports.
-rw-r--r--py/stream.c88
-rw-r--r--stmhal/mpconfigport.h2
-rwxr-xr-xtests/run-tests2
-rw-r--r--tests/unicode/data/utf-8_2.txt1
-rw-r--r--tests/unicode/file2.py12
-rw-r--r--unix/mpconfigport.h2
6 files changed, 103 insertions, 4 deletions
diff --git a/py/stream.c b/py/stream.c
index 2b4410728f..4c8b8a570a 100644
--- a/py/stream.c
+++ b/py/stream.c
@@ -67,6 +67,9 @@ STATIC mp_obj_t stream_read(uint n_args, const mp_obj_t *args) {
nlr_raise(mp_obj_new_exception_msg(&mp_type_OSError, "Operation not supported"));
}
+ // What to do if sz < -1? Python docs don't specify this case.
+ // CPython does a readall, but here we silently let negatives through,
+ // and they will cause a MemoryError.
mp_int_t sz;
if (n_args == 1 || ((sz = mp_obj_get_int(args[1])) == -1)) {
return stream_readall(args[0]);
@@ -74,7 +77,90 @@ STATIC mp_obj_t stream_read(uint n_args, const mp_obj_t *args) {
#if MICROPY_PY_BUILTINS_STR_UNICODE
if (!o->type->stream_p->is_bytes) {
- mp_not_implemented("Reading from unicode text streams by character count");
+ // We need to read sz number of unicode characters. Because we don't have any
+ // buffering, and because the stream API can only read bytes, we must read here
+ // in units of bytes and must never over read. If we want sz chars, then reading
+ // sz bytes will never over-read, so we follow this approach, in a loop to keep
+ // reading until we have exactly enough chars. This will be 1 read for text
+ // with ASCII-only chars, and about 2 reads for text with a couple of non-ASCII
+ // chars. For text with lots of non-ASCII chars, it'll be pretty inefficient
+ // in time and memory.
+
+ vstr_t vstr;
+ vstr_init(&vstr, sz);
+ mp_uint_t more_bytes = sz;
+ mp_uint_t last_buf_offset = 0;
+ while (more_bytes > 0) {
+ char *p = vstr_add_len(&vstr, more_bytes);
+ if (p == NULL) {
+ nlr_raise(mp_obj_new_exception_msg_varg(&mp_type_MemoryError, "out of memory"));
+ }
+ int error;
+ mp_int_t out_sz = o->type->stream_p->read(o, p, more_bytes, &error);
+ if (out_sz == -1) {
+ vstr_cut_tail_bytes(&vstr, more_bytes);
+ if (is_nonblocking_error(error)) {
+ // With non-blocking streams, we read as much as we can.
+ // If we read nothing, return None, just like read().
+ // Otherwise, return data read so far.
+ // TODO what if we have read only half a non-ASCII char?
+ if (vstr.len == 0) {
+ vstr_clear(&vstr);
+ return mp_const_none;
+ }
+ break;
+ }
+ nlr_raise(mp_obj_new_exception_msg_varg(&mp_type_OSError, "[Errno %d]", error));
+ }
+
+ if (out_sz == 0) {
+ // Finish reading.
+ // TODO what if we have read only half a non-ASCII char?
+ vstr_cut_tail_bytes(&vstr, more_bytes);
+ break;
+ }
+
+ // count chars from bytes just read
+ for (mp_uint_t off = last_buf_offset;;) {
+ byte b = vstr.buf[off];
+ int n;
+ if (!UTF8_IS_NONASCII(b)) {
+ // 1-byte ASCII char
+ n = 1;
+ } else if ((b & 0xe0) == 0xc0) {
+ // 2-byte char
+ n = 2;
+ } else if ((b & 0xf0) == 0xe0) {
+ // 3-byte char
+ n = 3;
+ } else if ((b & 0xf8) == 0xf0) {
+ // 4-byte char
+ n = 4;
+ } else {
+ // TODO
+ n = 5;
+ }
+ if (off + n <= vstr.len) {
+ // got a whole char in n bytes
+ off += n;
+ sz -= 1;
+ last_buf_offset = off;
+ if (off >= vstr.len) {
+ more_bytes = sz;
+ break;
+ }
+ } else {
+ // didn't get a whole char, so work out how many extra bytes are needed for
+ // this partial char, plus bytes for additional chars that we want
+ more_bytes = (off + n - vstr.len) + (sz - 1);
+ break;
+ }
+ }
+ }
+
+ mp_obj_t ret = mp_obj_new_str_of_type(&mp_type_str, (byte*)vstr.buf, vstr.len);
+ vstr_clear(&vstr);
+ return ret;
}
#endif
diff --git a/stmhal/mpconfigport.h b/stmhal/mpconfigport.h
index 00afa989cc..95f142ca48 100644
--- a/stmhal/mpconfigport.h
+++ b/stmhal/mpconfigport.h
@@ -44,7 +44,7 @@
*/
#define MICROPY_ENABLE_LFN (1)
#define MICROPY_LFN_CODE_PAGE (437) /* 1=SFN/ANSI 437=LFN/U.S.(OEM) */
-#define MICROPY_PY_BUILTINS_STR_UNICODE (0)
+#define MICROPY_PY_BUILTINS_STR_UNICODE (1)
#define MICROPY_PY_BUILTINS_FROZENSET (1)
#define MICROPY_PY_SYS_EXIT (1)
#define MICROPY_PY_SYS_STDFILES (1)
diff --git a/tests/run-tests b/tests/run-tests
index 71a94f946f..4b48421ded 100755
--- a/tests/run-tests
+++ b/tests/run-tests
@@ -134,7 +134,7 @@ def main():
if args.test_dirs is None:
if pyb is None:
# run PC tests
- test_dirs = ('basics', 'micropython', 'float', 'import', 'io', 'misc')
+ test_dirs = ('basics', 'micropython', 'float', 'import', 'io', 'misc', 'unicode')
else:
# run pyboard tests
test_dirs = ('basics', 'micropython', 'float', 'pyb', 'pybnative', 'inlineasm')
diff --git a/tests/unicode/data/utf-8_2.txt b/tests/unicode/data/utf-8_2.txt
new file mode 100644
index 0000000000..ab0eaa4e0d
--- /dev/null
+++ b/tests/unicode/data/utf-8_2.txt
@@ -0,0 +1 @@
+aαbβcγdδ
diff --git a/tests/unicode/file2.py b/tests/unicode/file2.py
new file mode 100644
index 0000000000..aca2e0e0ed
--- /dev/null
+++ b/tests/unicode/file2.py
@@ -0,0 +1,12 @@
+# test reading a given number of characters
+
+def do(mode):
+ f = open('unicode/data/utf-8_2.txt', mode)
+ print(f.read(1))
+ print(f.read(1))
+ print(f.read(2))
+ print(f.read(4))
+ f.close()
+
+do('rb')
+do('rt')
diff --git a/unix/mpconfigport.h b/unix/mpconfigport.h
index 0831e3fd34..ce4365d365 100644
--- a/unix/mpconfigport.h
+++ b/unix/mpconfigport.h
@@ -43,7 +43,7 @@
#define MICROPY_LONGINT_IMPL (MICROPY_LONGINT_IMPL_MPZ)
#define MICROPY_STREAMS_NON_BLOCK (1)
#define MICROPY_OPT_COMPUTED_GOTO (1)
-#define MICROPY_PY_BUILTINS_STR_UNICODE (0)
+#define MICROPY_PY_BUILTINS_STR_UNICODE (1)
#define MICROPY_PY_BUILTINS_FROZENSET (1)
#define MICROPY_PY_SYS_EXIT (1)
#define MICROPY_PY_SYS_PLATFORM "linux"