-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
a23516a
commit 994eb2f
Showing
18 changed files
with
18,518 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
.idea/ | ||
*.pyc | ||
*.egg-info | ||
*.swp | ||
PKG-INFO | ||
/dist | ||
/build |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
rurecoder | ||
======= | ||
|
||
Changelog | ||
--------- | ||
|
||
v0.1.0 | ||
- Реализация базовой функциональности. | ||
|
||
v0.2.0 | ||
- Добавление декодеров. Теперь умеет декодить такие кракозябры (взял примеры на 2cyr.com): | ||
- - èðèëèöà | ||
- - %D0%A2%D0%BE%D0%B2%D0%B0+%D0%B5+%D0%BA | ||
- - åäíà ãîäè | ||
- - ирилица | ||
|
||
v0.3.0 | ||
- Добавлена поддержка python3. | ||
|
||
v0.3.1 | ||
- Удаление из зависимостей пакета regex. | ||
- Минорные фиксы. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
include README.rst *.py | ||
recursive-include rurecoder *.py *.json |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,91 @@ | ||
rurecoder | ||
======= | ||
|
||
|
||
Назначение | ||
---------- | ||
|
||
Пакет нужен чтобы чинить "кракозябры" (или "краказябры") в читаемый текст. Например: "õîğîøèé òåêñò" => "хооший текст". | ||
|
||
|
||
Установка | ||
--------- | ||
:: | ||
|
||
$ git clone https://github.com/zvezdochiot/python-rurecoder #(based https://bitbucket.org/dkuryakin/recoder.git) | ||
$ cd python-rurecoder && python setup.py install | ||
|
||
или | ||
:: | ||
|
||
$ pip install rurecoder | ||
|
||
Полезные команды | ||
---------------- | ||
|
||
Использование как консольная тулза. | ||
:: | ||
|
||
$ echo "Îñíîâíàÿ Îëèìïèéñêàÿ äåðåâíÿ â" | python -mrurecoder [coding] | ||
|
||
По умолчанию, coding=utf-8. | ||
|
||
Использование в коде | ||
-------------------- | ||
|
||
Чаще всего с кракозябрами справится такой базовый пример: | ||
|
||
.. code-block:: python | ||
from rurecoder.cyrillic import Recoder | ||
rec = Recoder() | ||
broken_text = u'Îñíîâíàÿ Îëèìïèéñêàÿ äåðåâíÿ â' | ||
fixed_text = rec.fix_common(broken_text) | ||
print fixed_text.encode('utf-8') | ||
Если базовый пример не справился, можно поиграться с настройками: | ||
|
||
.. code-block:: python | ||
from rurecoder.cyrillic import Recoder | ||
rec = Recoder(depth=4) | ||
broken_text = u'...' | ||
fixed_text = rec.fix(broken_text) # fix работает дольше и сложнее чем fix_common | ||
... | ||
Можно использовать частоупотребимые слова (и, на, к, в, ...) как индикатор успеха перекодировки. Но в этом случае текст починится только если в нём есть эти слова: | ||
|
||
.. code-block:: python | ||
from rurecoder.cyrillic import Recoder | ||
rec = Recoder(use_plus_words=True) | ||
... | ||
Замечания | ||
--------- | ||
|
||
В данный момент поддерживается только кириллица. | ||
|
||
Расширение | ||
---------- | ||
|
||
Если хочется расширить библиотеку не только кириллицей, предусмотрена удобная тулза: | ||
:: | ||
|
||
$ cat some_learning_text.txt | python -mrurecoder.builder [coding] | ||
|
||
По умолчанию, coding=utf-8. На stdin подавать текстовку для обучения. На выходе получится 2 файлика: 3grams.json и plus_words.json. Далее всё делается по аналогии с rurecoder.cyrillic. | ||
|
||
Тесты | ||
----- | ||
|
||
Тут всё просто: | ||
:: | ||
|
||
$ git clone https://github.com/zvezdochiot/python-rurecoder #(based https://bitbucket.org/dkuryakin/recoder.git) | ||
$ cd python-rurecoder && python setup.py test | ||
|
||
See also CHANGELOG.rst |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
#!/usr/bin/env python | ||
|
||
import sys, os | ||
|
||
# Detect where the script is run from and set the prefix and the module search path. | ||
# It is assumed that the user installed this package using the --prefix= option. | ||
# sys.path[0] is split into its parent directory (prefix) and its last component (bin). | ||
prefix, bin = os.path.split(sys.path[0]) | ||
|
||
# The condition holds when the last component is named 'bin' and its parent differs from sys.prefix. | ||
if bin == 'bin' and prefix != sys.prefix: | ||
sys.prefix = prefix | ||
sys.exec_prefix = prefix | ||
|
||
# Library directories under the detected prefix; they are placed ahead of the existing sys.path entries. | ||
major, minor = sys.version_info[0:2] | ||
local_path = [os.path.join(prefix, 'lib', 'python'), | ||
os.path.join(prefix, 'lib', 'python%s.%s' % (major, minor)), | ||
os.path.join(prefix, 'lib', 'python%s.%s' % (major, minor), | ||
'site-packages')] | ||
sys.path = local_path + sys.path | ||
|
||
# Imported after the path setup above (presumably so a prefix install is found). | ||
# Requires rurecoder.__main__ to expose a callable named `main`. | ||
from rurecoder.__main__ import main | ||
|
||
if __name__ == '__main__': | ||
main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
from __future__ import absolute_import | ||
|
||
from . import tests | ||
from . import cyrillic | ||
from . import base_recoder |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
from __future__ import unicode_literals | ||
|
||
import sys | ||
from . import cyrillic | ||
from .pyver import * | ||
|
||
def _read_input(coding, explicit_coding):
    """Read all of stdin and return it as unicode text.

    On Python 2 the raw bytes are always decoded with ``coding``. On
    Python 3 the text-mode stream is used as-is unless a coding was given
    on the command line, in which case the underlying byte stream is read
    and decoded with it. Undecodable bytes are dropped.
    """
    if pyver == 2:
        return sys.stdin.read().decode(coding, errors='ignore')
    if not explicit_coding:
        return sys.stdin.read()
    return sys.stdin.buffer.read().decode(coding, errors='ignore')


def _write_output(text, coding, explicit_coding):
    """Write ``text`` to stdout, mirroring the rules used by ``_read_input``."""
    if pyver == 2:
        sys.stdout.write(text.encode(coding, errors='ignore'))
    elif not explicit_coding:
        sys.stdout.write(text)
    else:
        sys.stdout.buffer.write(text.encode(coding, errors='ignore'))


def main():
    """Console entry point: repair the text on stdin and print it to stdout.

    The optional first command-line argument names the encoding of the
    input and output streams; it defaults to ``utf-8``.
    """
    # len(sys.argv) > 1 means the user named a coding explicitly.
    explicit_coding = len(sys.argv) > 1
    coding = sys.argv[1] if explicit_coding else 'utf-8'

    input_data = _read_input(coding, explicit_coding)
    output_data = cyrillic.Recoder().fix_common(input_data)
    _write_output(output_data, coding, explicit_coding)


# Guarded so that importing this module (e.g. from a launcher script that
# does ``from rurecoder.__main__ import main``) does not consume stdin.
if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,104 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
from __future__ import with_statement | ||
|
||
from .pyver import * | ||
import json | ||
|
||
__all__ = ['BaseRecoder']

html_parser = HTMLParser.HTMLParser()


def _html_unescape(text):
    """Replace HTML character references in ``text`` with their characters.

    ``HTMLParser.unescape`` was removed in Python 3.9, so ``html.unescape``
    is used when it can be imported; the parser method remains the fallback
    for interpreters that do not provide it.
    """
    try:
        from html import unescape
    except ImportError:
        return html_parser.unescape(text)
    return unescape(text)


class BaseRecoder(object):
    """Repair mis-encoded text by trying re-encoding chains.

    Candidate texts are produced by applying encode/decode/unquote/unescape
    steps and are scored by the average weight of their 3-character grams;
    the best-scoring candidate wins.
    """

    # Left as None here. __init__ opens file_3grams and file_plus_words, and
    # the fix methods iterate over codings, so these must be set before an
    # instance is created. base_dir is not read anywhere in this class.
    base_dir = None
    file_3grams = None
    file_plus_words = None
    codings = None

    # Each entry is [callable, coding_dependent, allowed_types]:
    #   callable         - invoked with text=, coding= and errors= keywords;
    #   coding_dependent - when true the callable is tried once per coding;
    #   allowed_types    - the step is skipped unless text is of these types.
    funcs = [
        [lambda *args, **kwargs: kwargs['text'].encode(kwargs['coding'], errors=kwargs['errors']), True, (unicode_type,)],
        [lambda *args, **kwargs: kwargs['text'].decode(kwargs['coding'], errors=kwargs['errors']), True, (encoded_type,)],
        [lambda *args, **kwargs: unquote_plus(kwargs['text']), False, (encoded_type,)],
        [lambda *args, **kwargs: unquote_plus(kwargs['text'].replace('=', '%')), False, (encoded_type,)],
        [lambda *args, **kwargs: _html_unescape(kwargs['text']), False, (unicode_type,)],
    ]

    # Exceptions raised by a single step that just mean "this candidate
    # does not work" and are therefore ignored.
    regular_error_classes = (
        UnicodeError,
        AttributeError,  # for py3 support
    )

    def __init__(self, depth=2, errors='ignore', use_plus_words=False):
        """Load the gram weights and the plus-word list.

        :param depth: maximum number of chained steps tried by ``fix``.
        :param errors: error handler passed to every encode/decode call.
        :param use_plus_words: when true, a candidate is accepted only if it
            contains at least one plus word surrounded by spaces.
        """
        self.depth = depth
        self.errors = errors
        self.use_plus_words = use_plus_words
        # Callable reproducing the last accepted repair; None until a fix
        # method accepts a candidate.
        self.last_transform = None

        with open(self.file_3grams) as f:
            self.grams = json.load(f)

        with open(self.file_plus_words) as f:
            self.plus_words = set(json.load(f))

    def _contains_plus_word(self, text):
        """Return True if any plus word occurs in ``text`` between spaces."""
        return any((' ' + word + ' ') in text for word in self.plus_words)

    def _iter(self, text, depth, transform=lambda _text: _text):
        """Yield ``(candidate, transform)`` pairs reachable from ``text``.

        Every applicable step from ``funcs`` is applied, then the search
        recurses on the result with ``depth - 1``. ``transform`` is the
        callable that maps the original input to ``text``; each yielded
        transform maps the original input to the yielded candidate.
        """
        if depth <= 0:
            # A plain return ends the generator; raising StopIteration here
            # is turned into RuntimeError by PEP 479 (Python 3.7+).
            return
        for func, coding_dependent, allowed_types in self.funcs:
            if not isinstance(text, allowed_types):
                continue
            codings = self.codings if coding_dependent else ['fake_coding']
            for coding in codings:
                try:
                    fixed_text = func(text=text, coding=coding, errors=self.errors)
                    # func and coding are bound as defaults so the callable
                    # keeps working after the loops have moved on.
                    new_transform = (
                        lambda _text, func=func, coding=coding:
                        func(text=transform(_text), coding=coding, errors=self.errors)
                    )
                    yield fixed_text, new_transform
                    for sub_result in self._iter(fixed_text, depth - 1, new_transform):
                        yield sub_result
                except self.regular_error_classes:
                    pass

    def _calc_weight(self, text):
        """Return the mean gram weight over all 3-character windows of ``text``.

        Grams missing from the table weigh 0.0; texts shorter than three
        characters score 0.0.
        """
        count = len(text) - 2
        if count <= 0:
            return 0.0
        # The float start value keeps the division below a true division
        # even when the table holds integer weights.
        weight = sum(
            (self.grams.get(text[i:i + 3], 0.0) for i in range_iterator(count)),
            0.0,
        )
        return weight / count

    def _is_acceptable(self, lowered_text):
        """Apply the optional plus-word requirement to a lower-cased candidate."""
        return not self.use_plus_words or self._contains_plus_word(lowered_text)

    def fix(self, unicode_text):
        """Return the best repair of ``unicode_text`` found by the deep search.

        All step chains up to ``self.depth`` long are tried. The input is
        returned unchanged when no candidate scores strictly higher than it.
        On success ``self.last_transform`` is set to the winning chain.
        """
        max_weight = self._calc_weight(unicode_text.lower())
        max_text = unicode_text
        for fixed_text, transform in self._iter(unicode_text, self.depth):
            if not isinstance(fixed_text, unicode_type):
                continue
            lowered = fixed_text.lower()
            weight = self._calc_weight(lowered)
            if weight > max_weight and self._is_acceptable(lowered):
                max_weight = weight
                # fixed_text is exactly transform(unicode_text), so the
                # chain does not need to be run a second time.
                max_text = fixed_text
                self.last_transform = transform
        return max_text

    def fix_common(self, unicode_text):
        """Return the best repair using a single encode/decode pair.

        Every ordered pair of distinct codings is tried. The input is
        returned unchanged when no candidate scores strictly higher than it.
        On success ``self.last_transform`` is set to the winning pair.
        """
        max_weight = self._calc_weight(unicode_text.lower())
        max_text = unicode_text

        for ce in self.codings:
            for cd in self.codings:
                if ce == cd:
                    continue
                try:
                    candidate = unicode_text.encode(ce, errors=self.errors).decode(cd, errors=self.errors)
                    lowered = candidate.lower()
                    weight = self._calc_weight(lowered)
                    if weight > max_weight and self._is_acceptable(lowered):
                        max_weight = weight
                        max_text = candidate
                        # ce and cd are bound as defaults; a plain closure
                        # would see the last pair once the loops finish.
                        self.last_transform = (
                            lambda text, ce=ce, cd=cd:
                            text.encode(ce, errors=self.errors).decode(cd, errors=self.errors)
                        )
                except self.regular_error_classes:
                    pass
        return max_text
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
from __future__ import with_statement | ||
|
||
import sys | ||
import json | ||
from .builder import Builder | ||
from ..pyver import * | ||
|
||
# The optional first command-line argument names the encoding of the
# training text supplied on stdin.
coding = 'utf-8'
if len(sys.argv) > 1:
    coding = sys.argv[1]

# True when the script was started without any extra arguments.
no_coding_given = len(sys.argv) == 1

if pyver == 2:
    input_data = sys.stdin.read().decode(coding, errors='ignore')
elif pyver == 3 and no_coding_given:
    # Text-mode stdin is used as-is when no coding was requested.
    input_data = sys.stdin.read()
elif pyver == 3:
    input_data = sys.stdin.buffer.read().decode(coding, errors='ignore')

builder = Builder()

# Each output file is opened first, then its table is built and dumped.
outputs = (
    ('3grams.json', builder.build_grams),
    ('plus_words.json', builder.build_plus_words),
)
for file_name, build in outputs:
    with open(file_name, 'w') as f:
        json.dump(build(input_data), f, indent=4)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
from __future__ import unicode_literals | ||
|
||
import re | ||
from ..pyver import * | ||
|
||
class Builder(object):
    """Build frequency tables (3-grams and frequent words) from a training text."""

    def __init__(self, plus_words_count=30, grams_count=10000):
        """Store the table size limits.

        :param plus_words_count: maximum number of words kept by
            ``build_plus_words``.
        :param grams_count: maximum number of grams kept by ``build_grams``.
        """
        self.plus_words_count = plus_words_count
        self.grams_count = grams_count

    def build_grams(self, unicode_text):
        """Return a dict mapping the most frequent 3-grams to their counts.

        The text is lower-cased first; every 3-character window is counted
        and at most ``grams_count`` of the most frequent grams are kept.
        """
        unicode_text = unicode_text.lower()
        grams = {}
        for i in range_iterator(len(unicode_text) - 2):
            gram = unicode_text[i:i + 3]
            grams[gram] = grams.get(gram, 0) + 1
        top_grams = sorted(grams.items(), key=lambda item: item[1], reverse=True)[:self.grams_count]
        return dict(top_grams)

    def build_plus_words(self, unicode_text):
        """Return the most frequent words of the text, most frequent first.

        The text is lower-cased and every non-alphabetic character is
        treated as a separator. At most ``plus_words_count`` words are
        returned; a text without any letters yields an empty list.
        """
        lowered = unicode_text.lower()
        filtered_text = ''.join(c if c.isalpha() else ' ' for c in lowered)

        counts = {}
        # split() without arguments never yields empty strings, so an empty
        # or letter-free text cannot produce '' as a "word".
        for word in filtered_text.split():
            counts[word] = counts.get(word, 0) + 1
        top_words = sorted(counts.items(), key=lambda item: item[1], reverse=True)[:self.plus_words_count]
        return [word for word, _count in top_words]
Oops, something went wrong.