summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Guillaume Seguin <guillaume@segu.in> 2007-12-15 22:56:49 +0100
committer Guillaume Seguin <guillaume@segu.in> 2007-12-15 22:56:49 +0100
commit 35ee832bf430e26572ce4f1e48f4fbb0b52cb84a (patch)
tree 756e3dd0edce16fa6e691fa921868871f13db1c8
parent 57051bef756aff7ff21c7a079a27fe16cda2478b (diff)
download camlui-35ee832bf430e26572ce4f1e48f4fbb0b52cb84a.tar.gz
camlui-35ee832bf430e26572ce4f1e48f4fbb0b52cb84a.tar.bz2
* Try to detect encoding when opening a file
-rw-r--r-- camlui/decodeh.py 97
-rw-r--r-- camlui/document.py 4
2 files changed, 100 insertions, 1 deletions
diff --git a/camlui/decodeh.py b/camlui/decodeh.py
new file mode 100644
index 0000000..b1025f9
--- /dev/null
+++ b/camlui/decodeh.py
@@ -0,0 +1,97 @@
+#!/usr/bin/env python
+
+"""Demonstrate one heuristic approach to determining a string's encoding.
+From http://www.webfast.com/~skip/python/,
+by Skip Montanaro <skip@mojam.com>"""
+
+import sys
+import re
+
def decode_heuristically(s, enc=None, denc=sys.getdefaultencoding()):
    """Guess the encoding of the byte string s and decode it.

    Candidate encodings are tried in this order: the caller-provided enc,
    the default encoding denc (unless it is "ascii"), then utf-8,
    iso-8859-1, cp1252 and iso-8859-15.  Unknown encoding names are
    skipped instead of aborting the detection.

    The return value is a three-element tuple (text, punted, encoding):

    * text is s itself when s is already a Unicode object or is pure
      ASCII, otherwise the decoded Unicode object.
    * punted is 0 when some encoding decoded s exactly, and 1 if the
      decoder had to punt and delete some characters from the input to
      successfully generate a Unicode object.
    * encoding is the name of the encoding that was used ("utf-8" is
      reported for input that was already Unicode, "ascii" for pure ASCII
      input).
    """
    # ``unicode`` on Python 2, ``str`` on Python 3; looked up this way so the
    # function does not depend on a builtin that only one of them provides.
    text_type = type(u"")

    if isinstance(s, text_type):
        return s, 0, "utf-8"

    try:
        text_type(s, "ascii")
    except UnicodeError:
        pass
    else:
        # if it's ascii, we're done
        return s, 0, "ascii"

    # Priority order: caller's encoding, default encoding, built-in guesses.
    # "ascii" is dropped because s is already known not to be pure ASCII.
    encodings = []
    for name in (enc, denc, "utf-8", "iso-8859-1", "cp1252", "iso-8859-15"):
        if name and name != "ascii" and name not in encodings:
            encodings.append(name)

    # Both scans depend only on s, so run them once instead of once per
    # candidate encoding.
    #
    # Most of the characters between 0x80 and 0x9F are displayable in cp1252
    # but are control characters in iso-8859-1/-15.
    has_c1_bytes = re.search(br"[\x80-\x9f]", s) is not None
    # Bytes in this set are more likely to be symbols used in iso-8859-15.
    has_latin9_symbols = (
        re.search(br"[\xa4\xa6\xa8\xb4\xb8\xbc-\xbe]", s) is not None)

    for name in encodings:
        # Skip encodings the byte-range heuristics rule out, even though
        # decoding with them might well succeed.
        if name in ("iso-8859-15", "iso-8859-1") and has_c1_bytes:
            continue
        if name in ("iso-8859-1", "cp1252") and has_latin9_symbols:
            continue
        try:
            decoded = text_type(s, name)
            round_trips = decoded.encode(name) == s
        except (LookupError, UnicodeError):
            # LookupError: unknown encoding name; UnicodeError: s is not
            # valid in this encoding.  Either way, try the next candidate.
            continue
        if round_trips:
            return decoded, 0, name

    # nothing worked perfectly - try again, but use the "ignore" parameter
    # and return the longest result
    lossy = []
    for name in encodings:
        try:
            lossy.append((text_type(s, name, "ignore"), name))
        except LookupError:
            continue
    # Longest text wins; ties are broken by comparing (text, encoding), which
    # matches sorting (length, (text, encoding)) and taking the last entry.
    decoded, name = max(lossy, key=lambda pair: (len(pair[0]), pair))
    return decoded, 1, name
+
def decode_by_counting(s, enc=None, denc=sys.getdefaultencoding(),
                       _str="strict"):
    """Guess the encoding of the byte string s by scoring each candidate.

    Uses a method from David Eppstein:
    http://mail.python.org/pipermail/python-list/2004-April/215185.html

    Every candidate encoding that can decode s is scored by the number of
    alphanumeric or whitespace characters in the decoded text, and the
    highest-scoring result is returned.  Candidates are the caller-provided
    enc, the default encoding denc (unless it is "ascii"), then utf-8,
    iso-8859-1, cp1252 and iso-8859-15.  Unknown encoding names are skipped.

    _str is the codec error-handling mode; it is "strict" for normal calls
    and is set to "ignore" by the function itself when it retries.

    The return value is a three-element tuple (text, punted, encoding):

    * text is s itself when s is already a Unicode object or is pure
      ASCII, otherwise the decoded Unicode object.
    * punted is 1 if the function had to retry with the "ignore" error
      mode, otherwise 0.
    * encoding is the name of the encoding that was used ("utf-8" is
      reported for input that was already Unicode, "ascii" for pure ASCII
      input).

    Raises UnicodeError if no candidate can decode s at all.  This is not
    expected in practice, since iso-8859-1 maps every byte.
    """
    # ``unicode`` on Python 2, ``str`` on Python 3; looked up this way so the
    # function does not depend on a builtin that only one of them provides.
    text_type = type(u"")

    if isinstance(s, text_type):
        return s, 0, "utf-8"

    try:
        text_type(s, "ascii")
    except UnicodeError:
        pass
    else:
        # if it's ascii, we're done
        return s, 0, "ascii"

    # Priority order: caller's encoding, default encoding, built-in guesses.
    # "ascii" is dropped because s is already known not to be pure ASCII.
    encodings = []
    for name in (enc, denc, "utf-8", "iso-8859-1", "cp1252", "iso-8859-15"):
        if name and name != "ascii" and name not in encodings:
            encodings.append(name)

    # The loop uses its own variable so the enc parameter is still the
    # caller's value if the "ignore" retry below has to pass it on.
    scores = []
    for name in encodings:
        try:
            decoded = text_type(s, name, _str)
        except (LookupError, UnicodeError):
            # Unknown encoding, or s is not valid in it: not a candidate.
            continue
        score = sum(1 for c in decoded if c.isalnum() or c.isspace())
        scores.append((score, decoded, name))

    if scores:
        # Highest score wins; ties are broken by comparing the decoded text
        # and then the encoding name, as sorting and taking the last entry
        # would.
        score, decoded, name = max(scores)
        return decoded, 0, name

    if _str == "strict":
        # Nothing decoded cleanly: retry while dropping undecodable bytes
        # and report that we had to punt.
        decoded, _punt, name = decode_by_counting(s, enc, denc, "ignore")
        return decoded, 1, name

    raise UnicodeError("no candidate encoding could decode the input")
diff --git a/camlui/document.py b/camlui/document.py
index 3e30b70..a13eea7 100644
--- a/camlui/document.py
+++ b/camlui/document.py
@@ -23,6 +23,7 @@
import os
from dialogs import *
+from decodeh import *
class Document:
'''Simple document object capable of opening/saving files'''
@@ -43,7 +44,8 @@ class Document:
def load (self):
'''Load a file from disk'''
f = open (self.path, "r")
- self.data = f.read ()
+ data = f.read ()
+ self.data = decode_heuristically (data)[0]
f.close ()
self.buffer.begin_not_undoable_action ()
self.buffer.set_text (self.data)