0.3.1

Text-extend-tools · May 12, 2018 · 994eb2f · 994eb2f
1 parent a23516a
commit 994eb2f
Show file tree

Hide file tree

Showing 18 changed files with 18,518 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,7 @@
+.idea/
+*.pyc
+*.egg-info
+*.swp
+PKG-INFO
+/dist
+/build
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -0,0 +1,22 @@
+rurecoder
+=========
+
+Changelog
+---------
+
+v0.1.0
+ - Реализация базовой функциональности.
+
+v0.2.0
+ - Добавление декодеров. Теперь умеет декодить такие кракозябры (взял примеры на 2cyr.com):
+ - - &egrave;&eth;&egrave;&euml;&egrave;&ouml;&agrave;
+ - - %D0%A2%D0%BE%D0%B2%D0%B0+%D0%B5+%D0%BA
+ - - &#229;&#228;&#237;&#224; &#227;&#238;&#228;&#232;
+ - - &#1080;&#1088;&#1080;&#1083;&#1080;&#1094;&#1072;
+
+v0.3.0
+ - Добавлена поддержка python3.
+
+v0.3.1
+ - Удаление из зависимостей пакета regex.
+ - Минорные фиксы.
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -0,0 +1,2 @@
+include README.rst *.py
+recursive-include rurecoder *.py *.json
diff --git a/README.rst b/README.rst
@@ -0,0 +1,91 @@
+rurecoder
+=========
+
+
+Назначение
+----------
+
+Пакет нужен чтобы чинить "кракозябры" (или "краказябры") в читаемый текст. Например: "õîğîøèé òåêñò" => "хооший текст".
+
+
+Установка
+---------
+::
+
+    $ git clone https://github.com/zvezdochiot/python-rurecoder #(based https://bitbucket.org/dkuryakin/recoder.git)
+    $ cd python-rurecoder && python setup.py install
+
+или
+::
+
+    $ pip install rurecoder
+
+Полезные команды
+----------------
+
+Использование как консольная тулза.
+::
+
+    $ echo "Îñíîâíàÿ Îëèìïèéñêàÿ äåðåâíÿ â" | python -mrurecoder [coding]
+
+По умолчанию, coding=utf-8.
+
+Использование в коде
+--------------------
+
+Чаще всего с кракозябрами справится такой базовый пример:
+
+.. code-block:: python
+
+    from rurecoder.cyrillic import Recoder
+    rec = Recoder()
+    broken_text = u'Îñíîâíàÿ Îëèìïèéñêàÿ äåðåâíÿ â'
+    fixed_text = rec.fix_common(broken_text)
+    print fixed_text.encode('utf-8')
+
+
+Если базовый пример не справился, можно поиграться с настройками:
+
+.. code-block:: python
+
+    from rurecoder.cyrillic import Recoder
+    rec = Recoder(depth=4)
+    broken_text = u'...'
+    fixed_text = rec.fix(broken_text)  # fix работает дольше и сложнее чем fix_common
+    ...
+
+
+Можно использовать частоупотребимые слова (и, на, к, в, ...) как индикатор успеха перекодировки. Но в этом случае текст починится только если в нём есть эти слова:
+
+.. code-block:: python
+
+    from rurecoder.cyrillic import Recoder
+    rec = Recoder(use_plus_words=True)
+    ...
+
+
+Замечания
+---------
+
+В данный момент поддерживается только кириллица.
+
+Расширение
+----------
+
+Если хочется расширить библиотеку не только кириллицей, предусмотрена удобная тулза:
+::
+
+    $ cat some_learning_text.txt | python -mrurecoder.builder [coding]
+
+По умолчанию, coding=utf-8. На stdin подавать текстовку для обучения. На выходе получится 2 файлика: 3grams.json и plus_words.json. Далее всё делается по аналогии с rurecoder.cyrillic.
+
+Тесты
+-----
+
+Тут всё просто:
+::
+
+    $ git clone https://github.com/zvezdochiot/python-rurecoder #(based https://bitbucket.org/dkuryakin/recoder.git)
+    $ cd python-rurecoder && python setup.py test
+
+See also CHANGELOG.rst
diff --git a/bin/rurecoder b/bin/rurecoder
@@ -0,0 +1,23 @@
+#!/usr/bin/env python
+
+import sys, os
+
+# Try to detect where it is run from and set prefix and the search path.
+# It is assumed that the user installed cpuset using the --prefix= option
+prefix, bin = os.path.split(sys.path[0])
+
+if bin == 'bin' and prefix != sys.prefix:
+    sys.prefix = prefix
+    sys.exec_prefix = prefix
+
+    major, minor = sys.version_info[0:2]
+    local_path = [os.path.join(prefix, 'lib', 'python'),
+                  os.path.join(prefix, 'lib', 'python%s.%s' % (major, minor)),
+                  os.path.join(prefix, 'lib', 'python%s.%s' % (major, minor),
+                               'site-packages')]
+    sys.path = local_path + sys.path
+
+from rurecoder.__main__ import main
+
+if __name__ == '__main__':
+    main()
diff --git a/rurecoder/__init__.py b/rurecoder/__init__.py
@@ -0,0 +1,5 @@
+from __future__ import absolute_import
+
+from . import tests
+from . import cyrillic
+from . import base_recoder
diff --git a/rurecoder/__main__.py b/rurecoder/__main__.py
@@ -0,0 +1,27 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import unicode_literals
+
+import sys
+from . import cyrillic
+from .pyver import *
+
+coding = sys.argv[1] if len(sys.argv) > 1 else 'utf-8'
+
+if pyver == 2:
+    input_data = sys.stdin.read().decode(coding, errors='ignore')
+elif pyver == 3:
+    if len(sys.argv) == 1:
+        input_data = sys.stdin.read()
+    else:
+        input_data = sys.stdin.buffer.read().decode(coding, errors='ignore')
+
+output_data = cyrillic.Recoder().fix_common(input_data)
+
+if pyver == 2:
+    sys.stdout.write(output_data.encode(coding, errors='ignore'))
+elif pyver == 3:
+    if len(sys.argv) == 1:
+        sys.stdout.write(output_data)
+    else:
+        sys.stdout.buffer.write(output_data.encode(coding, errors='ignore'))
diff --git a/rurecoder/base_recoder.py b/rurecoder/base_recoder.py
@@ -0,0 +1,104 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import with_statement
+
+from .pyver import *
+import json
+
+__all__ = ['BaseRecoder']
+
+html_parser = HTMLParser.HTMLParser()
+
+class BaseRecoder(object):
+    base_dir = None
+    file_3grams = None
+    file_plus_words = None
+    codings = None
+
+    funcs = [
+        [lambda *args, **kwargs: kwargs['text'].encode(kwargs['coding'], errors=kwargs['errors']), True, (unicode_type,)],
+        [lambda *args, **kwargs: kwargs['text'].decode(kwargs['coding'], errors=kwargs['errors']), True, (encoded_type,)],
+        [lambda *args, **kwargs: unquote_plus(kwargs['text']), False, (encoded_type,)],
+        [lambda *args, **kwargs: unquote_plus(kwargs['text'].replace('=', '%')), False, (encoded_type,)],
+        [lambda *args, **kwargs: html_parser.unescape(kwargs['text']), False, (unicode_type,)],
+    ]
+
+    regular_error_classes = (
+        UnicodeError,
+        AttributeError,  # for py3 support
+    )
+
+    def __init__(self, depth=2, errors='ignore', use_plus_words=False):
+        self.depth = depth
+        self.errors = errors
+        self.use_plus_words = use_plus_words
+        self.last_transform = None
+
+        with open(self.file_3grams) as f:
+            self.grams = json.load(f)
+
+        with open(self.file_plus_words) as f:
+            self.plus_words = set(json.load(f))
+
+    def _contains_plus_word(self, text):
+        for word in self.plus_words:
+            if (' ' + word + ' ') in text:
+                return True
+        return False
+
+    def _iter(self, text, depth, transform=lambda _text: _text):
+        if depth <= 0:
+            raise StopIteration
+        for func, coding_dependent, allowed_types in self.funcs:
+            if not isinstance(text, allowed_types): continue
+            for coding in (self.codings if coding_dependent else ['fake_coding']):
+                try:
+                    fixed_text = func(text=text, coding=coding, errors=self.errors)
+                    new_transform = lambda _text: func(text=transform(_text), coding=coding, errors=self.errors)
+                    yield fixed_text, new_transform
+                    for sub_fixed_text, sub_new_transform in self._iter(fixed_text, depth - 1, new_transform):
+                        yield sub_fixed_text, sub_new_transform
+                except self.regular_error_classes:
+                    pass
+
+    def _calc_weight(self, text):
+        weight = 0.0
+        count = 0
+
+        for i in range_iterator(len(text) - 2):
+            gram = text[i:i+3]
+            weight += self.grams.get(gram, 0.0)
+            count += 1
+        return (weight / count) if count else 0.0
+
+    def fix(self, unicode_text):
+        max_weight = self._calc_weight(unicode_text.lower())
+        max_text = unicode_text
+        for fixed_text, transform in self._iter(unicode_text, self.depth):
+            if not isinstance(fixed_text, unicode_type):
+                continue
+            fixed_text = fixed_text.lower()
+            weight = self._calc_weight(fixed_text)
+            if weight > max_weight and (not self.use_plus_words or self._contains_plus_word(fixed_text)):
+                max_weight = weight
+                max_text = transform(unicode_text)
+                self.last_transform = transform
+        return max_text
+
+    def fix_common(self, unicode_text):
+        max_weight = self._calc_weight(unicode_text.lower())
+        max_text = unicode_text
+
+        for ce in self.codings:
+            for cd in self.codings:
+                if ce == cd: continue
+                try:
+                    fixed_text = unicode_text.encode(ce, errors=self.errors).decode(cd, errors=self.errors).lower()
+                    weight = self._calc_weight(fixed_text)
+                    if weight > max_weight and (not self.use_plus_words or self._contains_plus_word(fixed_text)):
+                        max_weight = weight
+                        max_text = unicode_text.encode(ce, errors=self.errors).decode(cd, errors=self.errors)
+                        self.last_transform = lambda text: text.encode(ce, errors=self.errors).decode(cd, errors=self.errors)
+                except self.regular_error_classes:
+                    pass
+        return max_text
diff --git a/rurecoder/builder/__init__.py b/rurecoder/builder/__init__.py
diff --git a/rurecoder/builder/__main__.py b/rurecoder/builder/__main__.py
@@ -0,0 +1,28 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import with_statement
+
+import sys
+import json
+from .builder import Builder
+from ..pyver import *
+
+coding = sys.argv[1] if len(sys.argv) > 1 else 'utf-8'
+
+if pyver == 2:
+    input_data = sys.stdin.read().decode(coding, errors='ignore')
+elif pyver == 3:
+    if len(sys.argv) == 1:
+        input_data = sys.stdin.read()
+    else:
+        input_data = sys.stdin.buffer.read().decode(coding, errors='ignore')
+
+builder = Builder()
+
+with open('3grams.json', 'w') as f:
+    grams = builder.build_grams(input_data)
+    json.dump(grams, f, indent=4)
+
+with open('plus_words.json', 'w') as f:
+    plus_words = builder.build_plus_words(input_data)
+    json.dump(plus_words, f, indent=4)
diff --git a/rurecoder/builder/builder.py b/rurecoder/builder/builder.py
@@ -0,0 +1,31 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import unicode_literals
+
+import re
+from ..pyver import *
+
+class Builder(object):
+    def __init__(self, plus_words_count=30, grams_count=10000):
+        self.plus_words_count = plus_words_count
+        self.grams_count = grams_count
+
+    def build_grams(self, unicode_text):
+        unicode_text = unicode_text.lower()
+        grams = {}
+        for i in range_iterator(len(unicode_text) - 2):
+            gram = unicode_text[i:i+3]
+            grams[gram] = grams.get(gram, 0) + 1
+        top_grams = sorted(grams.items(), key=lambda item: item[1], reverse=True)[:self.grams_count]
+        return dict(top_grams)
+
+    def build_plus_words(self, unicode_text):
+        unicode_text = unicode_text.lower()
+        filtered_text = ''.join(map(lambda c: (c if c.isalpha() else ' '), unicode_text)).strip()
+        words = re.split('\s+', filtered_text)
+
+        plus_words = {}
+        for word in words:
+            plus_words[word] = plus_words.get(word, 0) + 1
+        top_words = sorted(plus_words.items(), key=lambda item: item[1], reverse=True)[:self.plus_words_count]
+        return list(dict(top_words).keys())
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		include README.rst *.py
		recursive-include rurecoder *.py *.json