Browse Source

Handle Latin-1 encoded text in diffs

This avoids a crash when running "cdiff -l" (on Python 3) in
https://github.com/myint/cppclean.

Also handle unknown encodings gracefully rather than crashing.
Steven Myint 11 years ago
parent
commit
095740253c
7 changed files with 67 additions and 8 deletions
  1. 2
    0
      .gitignore
  2. 22
    6
      cdiff.py
  3. 9
    0
      tests/latin1/in.diff
  4. 10
    0
      tests/latin1/out.normal
  5. 9
    0
      tests/latin1/out.side-by-side
  6. 9
    0
      tests/latin1/out.w70
  7. 6
    2
      tests/test_cdiff.py

+ 2
- 0
.gitignore View File

1
+.*.swp
1
 *.pyc
2
 *.pyc
2
 *~
3
 *~
3
 *.tmp
4
 *.tmp
4
 /.coverage
5
 /.coverage
6
+/.travis-solo/
5
 /htmlcov/
7
 /htmlcov/
6
 /MANIFEST
8
 /MANIFEST
7
 /build/
9
 /build/

+ 22
- 6
cdiff.py View File

36
 import difflib
36
 import difflib
37
 
37
 
38
 
38
 
39
+try:
40
+    unicode
41
+except NameError:
42
+    unicode = str
43
+
44
+
39
 COLORS = {
45
 COLORS = {
40
     'reset'         : '\x1b[0m',
46
     'reset'         : '\x1b[0m',
41
     'underline'     : '\x1b[4m',
47
     'underline'     : '\x1b[4m',
248
     def _forward_line(self):
254
     def _forward_line(self):
249
         try:
255
         try:
250
             line = next(self._istream)
256
             line = next(self._istream)
251
-            self._in.write(line.encode('utf-8'))
257
+            self._in.write(line)
252
         except StopIteration:
258
         except StopIteration:
253
             self._in.close()
259
             self._in.close()
254
 
260
 
643
 
649
 
644
 def decode(line):
650
 def decode(line):
645
     """Decode UTF-8 if necessary."""
651
     """Decode UTF-8 if necessary."""
646
-    try:
647
-        return line.decode('utf-8')
648
-    except (AttributeError, UnicodeDecodeError):
652
+    if isinstance(line, unicode):
649
         return line
653
         return line
650
 
654
 
655
+    for encoding in ['utf-8', 'latin1']:
656
+        try:
657
+            return line.decode(encoding)
658
+        except UnicodeDecodeError:
659
+            pass
660
+
661
+    return '*** cdiff: undecodable bytes ***\n'
662
+
651
 
663
 
652
 def main():
664
 def main():
653
     signal.signal(signal.SIGPIPE, signal.SIG_DFL)
665
     signal.signal(signal.SIGPIPE, signal.SIG_DFL)
715
             parser.print_help()
727
             parser.print_help()
716
             return 1
728
             return 1
717
     else:
729
     else:
718
-        diff_hdl = sys.stdin
730
+        diff_hdl = (sys.stdin.buffer if hasattr(sys.stdin, 'buffer')
731
+                    else sys.stdin)
719
 
732
 
720
     stream = PatchStream(diff_hdl)
733
     stream = PatchStream(diff_hdl)
721
 
734
 
730
         # pipe out stream untouched to make sure it is still a patch
743
         # pipe out stream untouched to make sure it is still a patch
731
         if sys.hexversion < 0x03000000:
744
         if sys.hexversion < 0x03000000:
732
             reload(sys).setdefaultencoding('utf8')
745
             reload(sys).setdefaultencoding('utf8')
746
+
747
+        byte_output = (sys.stdout.buffer if hasattr(sys.stdout, 'buffer')
748
+                       else sys.stdout)
733
         for line in stream:
749
         for line in stream:
734
-            sys.stdout.write(decode(line))
750
+            byte_output.write(line)
735
 
751
 
736
     if diff_hdl is not sys.stdin:
752
     if diff_hdl is not sys.stdin:
737
         diff_hdl.close()
753
         diff_hdl.close()

+ 9
- 0
tests/latin1/in.diff View File

1
+diff --git a/test/latin1.cc b/test/latin1.cc
2
+index a556e5c..d81bb0c 100644
3
+--- a/test/latin1.cc
4
++++ b/test/latin1.cc
5
+@@ -1,4 +1,3 @@
6
+-// é
7
+ int latin1()
8
+ {
9
+ 	static int x;

+ 10
- 0
tests/latin1/out.normal View File

1
+diff --git a/test/latin1.cc b/test/latin1.cc
2
+index a556e5c..d81bb0c 100644
3
+--- a/test/latin1.cc
4
++++ b/test/latin1.cc
5
+@@ -1,4 +1,3 @@
6
+-// é
7
+ int latin1()
8
+ {
9
+ 	static int x;
10
+

+ 9
- 0
tests/latin1/out.side-by-side View File

1
+diff --git a/test/latin1.cc b/test/latin1.cc
2
+index a556e5c..d81bb0c 100644
3
+--- a/test/latin1.cc
4
++++ b/test/latin1.cc
5
+@@ -1,4 +1,3 @@
6
+1 // é   
7
+2 int latin1()                                                                     1 int latin1()
8
+3 {                                                                                2 {
9
+4         static int x;                                                            3         static int x;

+ 9
- 0
tests/latin1/out.w70 View File

1
+diff --git a/test/latin1.cc b/test/latin1.cc
2
+index a556e5c..d81bb0c 100644
3
+--- a/test/latin1.cc
4
++++ b/test/latin1.cc
5
+@@ -1,4 +1,3 @@
6
+1 // é   
7
+2 int latin1()                                                           1 int latin1()
8
+3 {                                                                      2 {
9
+4         static int x;                                                  3         static int x;

+ 6
- 2
tests/test_cdiff.py View File

77
         utext = 'hello'.encode('utf-8')
77
         utext = 'hello'.encode('utf-8')
78
         self.assertEqual('hello', cdiff.decode(utext))
78
         self.assertEqual('hello', cdiff.decode(utext))
79
 
79
 
80
-    def test_malformed_utf8(self):
80
+    def test_latin_1(self):
81
         text = '\x80\x02q\x01(U'
81
         text = '\x80\x02q\x01(U'
82
-        self.assertEqual(text, cdiff.decode(text))
82
+        if sys.version_info[0] == 2:
83
+            decoded_text = text.decode('latin-1')
84
+        else:
85
+            decoded_text = text
86
+        self.assertEqual(decoded_text, cdiff.decode(text))
83
 
87
 
84
 
88
 
85
 class HunkTest(unittest.TestCase):
89
 class HunkTest(unittest.TestCase):