From f7ed5d111bbe30b36b2629a87d9fcc291de4aafb Mon Sep 17 00:00:00 2001
From: Ezio Melotti <ezio.melotti@gmail.com>
Date: Sun, 4 Nov 2012 23:21:38 +0200
Subject: #8271: the UTF-8 decoder now outputs the correct number of U+FFFD
 characters when used with the "replace" error handler on invalid UTF-8
 sequences. Patch by Serhiy Storchaka, tests by Ezio Melotti.

---
 Objects/unicodeobject.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'Objects/unicodeobject.c')

diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index f61f9d0df19..665f03d8849 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -4759,9 +4759,7 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
                 goto End;
             errmsg = "unexpected end of data";
             startinpos = s - starts;
-            endinpos = startinpos + 1;
-            while (endinpos < size && (starts[endinpos] & 0xC0) == 0x80)
-                endinpos++;
+            endinpos = end - starts;
             break;
         case 1:
             errmsg = "invalid start byte";
@@ -4769,11 +4767,11 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
             endinpos = startinpos + 1;
             break;
         case 2:
+        case 3:
+        case 4:
             errmsg = "invalid continuation byte";
             startinpos = s - starts;
-            endinpos = startinpos + 1;
-            while (endinpos < size && (starts[endinpos] & 0xC0) == 0x80)
-                endinpos++;
+            endinpos = startinpos + ch - 1;
             break;
         default:
             if (unicode_putchar(&unicode, &outpos, ch) < 0)
-- 
cgit v1.2.3