Browse Source

Handle Latin-1 encoded text in diffs

This avoids a crash when running "cdiff -l" (on Python 3) in
https://github.com/myint/cppclean.

Also handle unknown encodings gracefully rather than crashing.
Steven Myint 10 years ago
parent
commit
095740253c
7 changed files with 67 additions and 8 deletions
  1. 2
    0
      .gitignore
  2. 22
    6
      cdiff.py
  3. 9
    0
      tests/latin1/in.diff
  4. 10
    0
      tests/latin1/out.normal
  5. 9
    0
      tests/latin1/out.side-by-side
  6. 9
    0
      tests/latin1/out.w70
  7. 6
    2
      tests/test_cdiff.py

+ 2
- 0
.gitignore View File

@@ -1,7 +1,9 @@
1
+.*.swp
1 2
 *.pyc
2 3
 *~
3 4
 *.tmp
4 5
 /.coverage
6
+/.travis-solo/
5 7
 /htmlcov/
6 8
 /MANIFEST
7 9
 /build/

+ 22
- 6
cdiff.py View File

@@ -36,6 +36,12 @@ import select
36 36
 import difflib
37 37
 
38 38
 
39
+try:
40
+    unicode
41
+except NameError:
42
+    unicode = str
43
+
44
+
39 45
 COLORS = {
40 46
     'reset'         : '\x1b[0m',
41 47
     'underline'     : '\x1b[4m',
@@ -248,7 +254,7 @@ class PatchStreamForwarder(object):
248 254
     def _forward_line(self):
249 255
         try:
250 256
             line = next(self._istream)
251
-            self._in.write(line.encode('utf-8'))
257
+            self._in.write(line)
252 258
         except StopIteration:
253 259
             self._in.close()
254 260
 
@@ -643,11 +649,17 @@ def revision_control_log(args):
643 649
 
644 650
 def decode(line):
645 651
     """Decode UTF-8 if necessary."""
646
-    try:
647
-        return line.decode('utf-8')
648
-    except (AttributeError, UnicodeDecodeError):
652
+    if isinstance(line, unicode):
649 653
         return line
650 654
 
655
+    for encoding in ['utf-8', 'latin1']:
656
+        try:
657
+            return line.decode(encoding)
658
+        except UnicodeDecodeError:
659
+            pass
660
+
661
+    return '*** cdiff: undecodable bytes ***\n'
662
+
651 663
 
652 664
 def main():
653 665
     signal.signal(signal.SIGPIPE, signal.SIG_DFL)
@@ -715,7 +727,8 @@ def main():
715 727
             parser.print_help()
716 728
             return 1
717 729
     else:
718
-        diff_hdl = sys.stdin
730
+        diff_hdl = (sys.stdin.buffer if hasattr(sys.stdin, 'buffer')
731
+                    else sys.stdin)
719 732
 
720 733
     stream = PatchStream(diff_hdl)
721 734
 
@@ -730,8 +743,11 @@ def main():
730 743
         # pipe out stream untouched to make sure it is still a patch
731 744
         if sys.hexversion < 0x03000000:
732 745
             reload(sys).setdefaultencoding('utf8')
746
+
747
+        byte_output = (sys.stdout.buffer if hasattr(sys.stdout, 'buffer')
748
+                       else sys.stdout)
733 749
         for line in stream:
734
-            sys.stdout.write(decode(line))
750
+            byte_output.write(line)
735 751
 
736 752
     if diff_hdl is not sys.stdin:
737 753
         diff_hdl.close()

+ 9
- 0
tests/latin1/in.diff View File

@@ -0,0 +1,9 @@
1
+diff --git a/test/latin1.cc b/test/latin1.cc
2
+index a556e5c..d81bb0c 100644
3
+--- a/test/latin1.cc
4
++++ b/test/latin1.cc
5
+@@ -1,4 +1,3 @@
6
+-// é
7
+ int latin1()
8
+ {
9
+ 	static int x;

+ 10
- 0
tests/latin1/out.normal View File

@@ -0,0 +1,10 @@
1
+diff --git a/test/latin1.cc b/test/latin1.cc
2
+index a556e5c..d81bb0c 100644
3
+--- a/test/latin1.cc
4
++++ b/test/latin1.cc
5
+@@ -1,4 +1,3 @@
6
+-// é
7
+ int latin1()
8
+ {
9
+ 	static int x;
10
+

+ 9
- 0
tests/latin1/out.side-by-side View File

@@ -0,0 +1,9 @@
1
+diff --git a/test/latin1.cc b/test/latin1.cc
2
+index a556e5c..d81bb0c 100644
3
+--- a/test/latin1.cc
4
++++ b/test/latin1.cc
5
+@@ -1,4 +1,3 @@
6
+1 // é   
7
+2 int latin1()                                                                     1 int latin1()
8
+3 {                                                                                2 {
9
+4         static int x;                                                            3         static int x;

+ 9
- 0
tests/latin1/out.w70 View File

@@ -0,0 +1,9 @@
1
+diff --git a/test/latin1.cc b/test/latin1.cc
2
+index a556e5c..d81bb0c 100644
3
+--- a/test/latin1.cc
4
++++ b/test/latin1.cc
5
+@@ -1,4 +1,3 @@
6
+1 // é   
7
+2 int latin1()                                                           1 int latin1()
8
+3 {                                                                      2 {
9
+4         static int x;                                                  3         static int x;

+ 6
- 2
tests/test_cdiff.py View File

@@ -77,9 +77,13 @@ class DecodeTest(unittest.TestCase):
77 77
         utext = 'hello'.encode('utf-8')
78 78
         self.assertEqual('hello', cdiff.decode(utext))
79 79
 
80
-    def test_malformed_utf8(self):
80
+    def test_latin_1(self):
81 81
         text = '\x80\x02q\x01(U'
82
-        self.assertEqual(text, cdiff.decode(text))
82
+        if sys.version_info[0] == 2:
83
+            decoded_text = text.decode('latin-1')
84
+        else:
85
+            decoded_text = text
86
+        self.assertEqual(decoded_text, cdiff.decode(text))
83 87
 
84 88
 
85 89
 class HunkTest(unittest.TestCase):