diff --git a/docs/geotext.md b/docs/geotext.md
index 0a37096..6fb09b9 100644
--- a/docs/geotext.md
+++ b/docs/geotext.md
@@ -1,6 +1,7 @@
# Table of Contents
* [flashgeotext.geotext](#flashgeotext.geotext)
+ * [GeoTextConfiguration](#flashgeotext.geotext.GeoTextConfiguration)
* [GeoText](#flashgeotext.geotext.GeoText)
* [\_\_init\_\_](#flashgeotext.geotext.GeoText.__init__)
* [extract](#flashgeotext.geotext.GeoText.extract)
@@ -8,6 +9,20 @@
# flashgeotext.geotext
+
+## GeoTextConfiguration Objects
+
+```python
+class GeoTextConfiguration(BaseModel)
+```
+
+GeoText configuration
+
+**Arguments**:
+
+- `use_demo_data` _bool_ - whether to load the demo data (cities and countries), default True
+- `case_sensitive` _bool_ - whether keyword lookup is case-sensitive, default True
+
## GeoText Objects
@@ -29,7 +44,7 @@ span info.
```python
from flashgeotext.geotext import GeoText
- geotext = GeoText(use_demo_data=True)
+ geotext = GeoText()
input_text = '''Shanghai. The Chinese Ministry of Finance in Shanghai said that China plans
to cut tariffs on $75 billion worth of goods that the country
@@ -65,14 +80,14 @@ span info.
#### \_\_init\_\_
```python
- | __init__(use_demo_data: bool = True) -> None
+ | __init__(config: GeoTextConfiguration = GeoTextConfiguration().dict()) -> None
```
instantiate an empty LookupDataPool, optionally/by default with demo data
**Arguments**:
-- `use_demo_data` _bool_ - optionally use demo data, defaults to True.
+- `config` _dict_ - configuration as a dict with the keys `use_demo_data` and `case_sensitive`; defaults to `GeoTextConfiguration().dict()`, where both are True.
#### extract
diff --git a/docs/lookup.md b/docs/lookup.md
index 9e5d6c1..69bf653 100644
--- a/docs/lookup.md
+++ b/docs/lookup.md
@@ -158,7 +158,7 @@ Collection of KeywordProcessors from LookupData
#### add
```python
- | add(lookup: LookupData, update: bool = False) -> None
+ | add(lookup: LookupData, update: bool = False, case_sensitive: bool = True) -> None
```
Add LookupData to LookupDataPool
diff --git a/flashgeotext/geotext.py b/flashgeotext/geotext.py
index 39fc722..cfb219e 100644
--- a/flashgeotext/geotext.py
+++ b/flashgeotext/geotext.py
@@ -1,7 +1,23 @@
+from typing import Optional
+
+from pydantic import BaseModel
+
from flashgeotext.lookup import LookupDataPool
from flashgeotext.lookup import MissingLookupDataError
+class GeoTextConfiguration(BaseModel):
+ """GeoText configuration
+
+ Args:
+ use_demo_data (bool): whether to load the demo data (cities and countries), default True
+ case_sensitive (bool): whether keyword lookup is case-sensitive, default True
+ """
+
+ use_demo_data: Optional[bool] = True
+ case_sensitive: Optional[bool] = True
+
+
class GeoText(LookupDataPool):
"""Extract LookupData from input text
@@ -16,7 +32,7 @@ class GeoText(LookupDataPool):
```python
from flashgeotext.geotext import GeoText
- geotext = GeoText(use_demo_data=True)
+ geotext = GeoText()
input_text = '''Shanghai. The Chinese Ministry of Finance in Shanghai said that China plans
to cut tariffs on $75 billion worth of goods that the country
@@ -50,21 +66,23 @@ class GeoText(LookupDataPool):
"""
- def __init__(self, use_demo_data: bool = True) -> None:
+ def __init__(
+ self, config: GeoTextConfiguration = GeoTextConfiguration().dict()
+ ) -> None:
""" instantiate an empty LookupDataPool, optionally/by default with demo data
Args:
- use_demo_data (bool): optionally use demo data, defaults to True.
+ config (dict): configuration as a dict with the keys "use_demo_data" and "case_sensitive"; defaults to GeoTextConfiguration().dict(), where both are True.
"""
self.pool: dict = {}
- if use_demo_data:
- self._add_demo_data()
+ if config["use_demo_data"]:
+ self._add_demo_data(case_sensitive=config["case_sensitive"])
def extract(self, input_text: str, span_info: bool = True) -> dict:
"""Extract LookupData from an input_text
- Arguments:
+ Args:
input_text (str): String to extract LookupData from.
span_info (bool): Optionally, return span_info. Defaults to True.
@@ -94,7 +112,7 @@ def _parse_extract(self, extract_data: list, span_info: bool = True) -> dict:
Parse flashtext.KeywordProcessor.extract_keywords() output to count occurances,
and optionally span_info.
- Arguments:
+ Args:
extract_data (list): flashtext.KeywordProcessor.extract_keywords() return value
span_info (bool): optionally, parse span_info
diff --git a/flashgeotext/lookup.py b/flashgeotext/lookup.py
index dd85480..ad6dfb6 100644
--- a/flashgeotext/lookup.py
+++ b/flashgeotext/lookup.py
@@ -47,18 +47,6 @@ class LookupValidation:
error_count (int): Error count in validation data.
errors (dict):
- Example: {
- "Berlin": [
- "Berlin missing in list of synonyms",
- "data['Berlin'] is not a list of synonyms"
- ]
- }
-
- Arguments:
- status (str): Humanreadible string containing the Error status.
- error_count (int): Error count in validation data.
- errors (dict):
-
Example: {
"Berlin": [
"Berlin missing in list of synonyms",
@@ -161,7 +149,9 @@ class LookupDataPool:
def __init__(self) -> None:
self.pool: dict = {}
- def add(self, lookup: LookupData, update: bool = False) -> None:
+ def add(
+ self, lookup: LookupData, update: bool = False, case_sensitive: bool = True
+ ) -> None:
"""Add LookupData to LookupDataPool
Add LookupData to LookupDataPool.
@@ -170,17 +160,18 @@ def add(self, lookup: LookupData, update: bool = False) -> None:
Args:
lookup (LookupData): LookupData to add to pool
- update (bool): Allow update of an existing entry in LookupDataPool
+ update (bool): Allow update of an existing entry in LookupDataPool, default False
+ case_sensitive (bool): Match keywords case-sensitively, default True
"""
if not isinstance(lookup, LookupData):
- raise TypeError(f"lookup has to be instance of LookupData")
+ raise TypeError("lookup has to be instance of LookupData")
if lookup.name in self.pool and not update:
raise LookupDuplicateError(
f"'{lookup.name}' has already been added. Set update=True to update"
)
else:
- self.pool[lookup.name] = KeywordProcessor(case_sensitive=True)
+ self.pool[lookup.name] = KeywordProcessor(case_sensitive=case_sensitive)
self.pool[lookup.name].add_keywords_from_dict(lookup.data)
# if there is a script specified, then update non word boundaries with
@@ -209,7 +200,7 @@ def remove_all(self):
self.pool = {}
- def _add_demo_data(self):
+ def _add_demo_data(self, case_sensitive: bool = True):
"""(private) Add demo data to pool
Adds DEMODATA_CITIES and DEMODATA_COUNTRIES to LookupDataPool
@@ -220,8 +211,8 @@ def _add_demo_data(self):
countries = LookupData(
name="countries", data=load_data_from_file(file=DEMODATA_COUNTRIES)
)
- self.add(cities)
- self.add(countries)
+ self.add(cities, case_sensitive=case_sensitive)
+ self.add(countries, case_sensitive=case_sensitive)
logger.debug(f"demo data loaded for: {list(self.pool.keys())}")
diff --git a/readme.md b/readme.md
index 125b11e..d356318 100644
--- a/readme.md
+++ b/readme.md
@@ -20,7 +20,7 @@ Extract and count countries and cities (+their synonyms) from text, like [GeoTex
```python
from flashgeotext.geotext import GeoText
-geotext = GeoText(use_demo_data=True)
+geotext = GeoText()
input_text = '''Shanghai. The Chinese Ministry of Finance in Shanghai said that China plans
to cut tariffs on $75 billion worth of goods that the country
diff --git a/tests/conftest.py b/tests/conftest.py
index 2e09666..f30d369 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -18,4 +18,4 @@ def test_data_countries():
@pytest.fixture
def geotext():
- return GeoText(use_demo_data=True)
+ return GeoText()
diff --git a/tests/integration/test_geotext_extractor.py b/tests/integration/test_geotext_extractor.py
index 0342f12..a949e35 100644
--- a/tests/integration/test_geotext_extractor.py
+++ b/tests/integration/test_geotext_extractor.py
@@ -8,7 +8,7 @@
def test_geotext_demo_data():
- geotext = GeoText(use_demo_data=True)
+ geotext = GeoText()
assert geotext.pool["cities"]
assert geotext.pool["countries"]
@@ -20,7 +20,7 @@ def test_geotext_extract(geotext):
def test_geotext_raises_on_empty_pool():
- output = GeoText(use_demo_data=False)
+ output = GeoText(config={"use_demo_data": False})
with pytest.raises(MissingLookupDataError):
output.extract(text)
@@ -39,6 +39,16 @@ def test_geotext_extract_with_count_span_info_false(geotext):
assert output["cities"]["Berlin"]["span_info"] == [(0, 6), (43, 49)]
+def test_geotext_case_sensitive_demo_data():
+ geotext = GeoText(config={"use_demo_data": True, "case_sensitive": False})
+ text = "berlin ist ne tolle stadt"
+ output = geotext.extract(input_text=text, span_info=True)
+
+ print(output)
+
+ assert output["cities"]["Berlin"]["span_info"] == [(0, 6)]
+
+
# tests used in geotext (https://github.com/elyase/geotext)
@@ -161,7 +171,7 @@ def test_geotext_with_script_added_to_non_word_boundaries():
cyrillic = LookupData(
name="test_1", data={"Нижневартовск": ["Нижневартовск"]}, script="cyrillic"
)
- geotext = GeoText(use_demo_data=False)
+ geotext = GeoText(config={"use_demo_data": False})
geotext.add(cyrillic)
text = """