Merge pull request #38 from JohnSnowLabs/1.1.3rc1
Intent and action classification, analyze Chinese news and the crypto market, train a classifier that understands 100+ languages, translate between 200+ languages, answer questions, summarize text, and much more in NLU 1.1.3
C-K-Loan authored Feb 28, 2021
2 parents a223eee + df3da0d commit 8bd84ce
Showing 17 changed files with 403 additions and 69 deletions.
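
A minimal usage sketch for the release headline above (not part of this commit); the 'en.classify.snips' reference is taken from the nlu/namespace.py diff further below, and the exact output columns are assumptions:

```python
import nlu

# Intent/action classification: 'en.classify.snips' is one of the references
# registered in this release (see the namespace.py diff below).
pipe = nlu.load('en.classify.snips')

# predict() accepts strings, lists or DataFrames and returns a pandas DataFrame
df = pipe.predict('Book a table for two at an Italian restaurant tonight')
print(df)
```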
2 changes: 1 addition & 1 deletion docs/_data/navigation.yml
@@ -15,7 +15,7 @@ header:
- title: '<span style="color: #FF8A00;"><i class="fab fa-github fa-2x"></i></span>'
url: https://github.com/JohnSnowLabs/nlu
- title: '<span style="color: #FF8A00;"><i class="fab fa-slack-hash fa-2x"></i></span>'
url: https://join.slack.com/t/spark-nlp/shared_invite/enQtNjA4MTE2MDI1MDkxLWVjNWUzOGNlODg1Y2FkNGEzNDQ1NDJjMjc3Y2FkOGFmN2Q3ODIyZGVhMzU0NGM3NzRjNDkyZjZlZTQ0YzY1N2I
url: https://join.slack.com/t/spark-nlp/shared_invite/zt-lutct9gm-kuUazcyFKhuGY3_0AMkxqA
docs-en:
- title: Getting Started
children:
2 changes: 1 addition & 1 deletion docs/backup.md
@@ -14,7 +14,7 @@ article_header:
url: https://github.com/johnsnowlabs/spark-nlp
- text: '<i class="fab fa-slack-hash"></i> Slack'
type: outline-theme-dark
url: https://join.slack.com/t/spark-nlp/shared_invite/enQtNjA4MTE2MDI1MDkxLWVjNWUzOGNlODg1Y2FkNGEzNDQ1NDJjMjc3Y2FkOGFmN2Q3ODIyZGVhMzU0NGM3NzRjNDkyZjZlZTQ0YzY1N2I
url: https://join.slack.com/t/spark-nlp/shared_invite/zt-lutct9gm-kuUazcyFKhuGY3_0AMkxqA

height: 50vh
theme: dark
5 changes: 2 additions & 3 deletions docs/en/load_api.md
@@ -53,12 +53,11 @@ To configure your model or pipeline, first load a NLU component and use the prin
The print output tells you at which index of the pipe_components attribute each NLU component is located.
A model can be configured via setters, which are named after the model's parameters.


```python
#example for configuring the first element in the pipe
# example for configuring the first element in the pipe
pipe = nlu.load('en.sentiment.twitter')
pipe.generate_class_metadata_table()
document_assembler_model = pipe.pipe_components[0].model
document_assembler_model = pipe.components[0].model
document_assembler_model.setCleanupMode('inplace')
```
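
As a follow-up sketch (not in the original docs page): every NLU component wraps a PySpark ML stage, so its configurable parameters can be listed with the standard PySpark API before calling a setter.

```python
import nlu

pipe = nlu.load('en.sentiment.twitter')
document_assembler_model = pipe.components[0].model

# explainParams() is standard PySpark and lists parameters such as cleanupMode,
# each of which corresponds to a setter like setCleanupMode shown above.
print(document_assembler_model.explainParams())
```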

182 changes: 178 additions & 4 deletions docs/en/release_notes.md

Large diffs are not rendered by default.

18 changes: 12 additions & 6 deletions nlu/__init__.py
@@ -1,4 +1,4 @@
__version__ = '1.1.1'
__version__ = '1.1.3'

import sys

@@ -77,6 +77,7 @@ def check_python_version():
from nlu.components.embedding import Embeddings
from nlu.components.util import Util
from nlu.components.utils.ner_to_chunk_converter import ner_to_chunk_converter
from nlu.components.utils.sentence_embeddings.spark_nlp_sentence_embedding import SparkNLPSentenceEmbeddings

# sentence
from nlu.components.sentence_detectors.pragmatic_sentence_detector.sentence_detector import PragmaticSentenceDetector
@@ -159,7 +160,6 @@ def check_python_version():

import os
import sparknlp
sparknlp.start()

def read_nlu_info(path):
f = open(os.path.join(path,'nlu_info.txt'), "r")
@@ -227,7 +227,10 @@ def enable_verbose():
ch.setLevel(logging.INFO)
logger.addHandler(ch)


def is_spark_23_installed():
version = pyspark.version.__version__
if '2.3' == version[:3]: return True
return False
def load(request ='from_disk', path=None,verbose=False,version_checks=True):
'''
Load either a prebuild pipeline or a set of components identified by a whitespace seperated list of components
@@ -239,7 +242,8 @@ def load(request ='from_disk', path=None,verbose=False,version_checks=True):
'''
gc.collect()
# if version_checks : check_pyspark_install()
spark = sparknlp.start()

spark = sparknlp.start(spark23=is_spark_23_installed())
spark.catalog.clearCache()
spark_started = True
if verbose:
@@ -295,9 +299,11 @@ def parse_language_from_nlu_ref(nlu_ref):

def resolve_multi_lang_embed(language,sparknlp_reference):
if language == 'ar' and 'glove' in sparknlp_reference : return 'arabic_w2v_cc_300d'
if language == 'ur' : return 'urduvec_140M_300d'
else : return sparknlp_reference



def get_default_component_of_type(missing_component_type,language='en'):
'''
This function returns a default component for a missing component type.
@@ -327,11 +333,11 @@ def get_default_component_of_type(missing_component_type,language='en'):
if missing_component_type == 'ner_converter': return Util('ner_converter')

else:
multi_lang =['ar']
multi_lang =['ar','ur']
# if there is an @ in the name, we must get some specific pretrained model from the sparknlp reference that should follow after the @
missing_component_type, sparknlp_reference = missing_component_type.split('@')
if 'embed' in missing_component_type:
# TODO RESOLVE MULTI LANG EMBEDS

if language in multi_lang : sparknlp_reference = resolve_multi_lang_embed(language,sparknlp_reference)
return construct_component_from_identifier(language=language, component_type='embed',
nlp_ref=sparknlp_reference)
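
The change above defers starting the Spark session from import time to load(), which now selects the Spark 2.3 compatible build based on the installed PySpark version. A hedged sketch of that behaviour:

```python
import pyspark
import sparknlp

def is_spark_23_installed():
    # True for PySpark 2.3.x installations
    return pyspark.version.__version__[:3] == '2.3'

# load() now starts the session itself instead of relying on import-time startup
spark = sparknlp.start(spark23=is_spark_23_installed())
```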
5 changes: 4 additions & 1 deletion nlu/components/util.py
@@ -1,4 +1,4 @@
from nlu.pipe_components import SparkNLUComponent, NLUComponent
from nlu.pipe_components import SparkNLUComponent

class Util(SparkNLUComponent):

@@ -20,4 +20,7 @@ def __init__(self, annotator_class='document_assembler', component_type='util',
elif annotator_class == 'ner_to_chunk_converter' :
from nlu import NerToChunkConverter
self.model = NerToChunkConverter.get_default_model()
elif annotator_class == 'sentence_embeddings':
from nlu import SparkNLPSentenceEmbeddings
self.model = SparkNLPSentenceEmbeddings.get_default_model()
SparkNLUComponent.__init__(self, annotator_class, component_type)
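
A sketch of what the new 'sentence_embeddings' branch wires up (assumptions: a Spark session is already running and the remaining constructor arguments keep their defaults):

```python
import sparknlp
from nlu.components.util import Util

sparknlp.start()                                   # the annotator needs a live Spark session
util_component = Util(annotator_class='sentence_embeddings')
print(type(util_component.model))                  # Spark NLP SentenceEmbeddings annotator
```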
sentence_embeddings component configuration (JSON)
@@ -8,13 +8,13 @@
],
"inputs": [
"document",
"embeddings"
"word_embeddings"
],
"type": "sentence_embeddings",
"file_dependencies": {},
"spark_input_column_names": [
"document",
"embeddings"
"word_embeddings"
],
"spark_output_column_names": [
"sentence_embeddings"
nlu/components/utils/sentence_embeddings/spark_nlp_sentence_embedding.py
@@ -2,10 +2,10 @@
import sparknlp
from sparknlp.annotator import *

class SparkNLPSentenceEmbeddinge:
class SparkNLPSentenceEmbeddings:
@staticmethod
def get_default_model():
return SentenceEmbeddings() \
.setInputCols(["document", "embeddings"]) \
.setInputCols(["document", "word_embeddings"]) \
.setOutputCol("sentence_embeddings") \
.setPoolingStrategy("AVERAGE")
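
For context, a hedged sketch (not part of this commit) of how the default stage above consumes a word-embeddings column; the "word_embeddings" name matches the renamed input in the component configuration above:

```python
import sparknlp
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import Tokenizer, WordEmbeddingsModel, SentenceEmbeddings
from pyspark.ml import Pipeline

spark = sparknlp.start()

document_assembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")
word_embeddings = WordEmbeddingsModel.pretrained() \
    .setInputCols(["document", "token"]) \
    .setOutputCol("word_embeddings")
sentence_embeddings = SentenceEmbeddings() \
    .setInputCols(["document", "word_embeddings"]) \
    .setOutputCol("sentence_embeddings") \
    .setPoolingStrategy("AVERAGE")

pipeline = Pipeline(stages=[document_assembler, tokenizer, word_embeddings, sentence_embeddings])
```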
18 changes: 14 additions & 4 deletions nlu/namespace.py
@@ -241,7 +241,6 @@ class NameSpace():


# 2.7.0 new aliases
't5': ('t5_base','model'),
't5.summarize': ('t5_base','model'),
't5.classify.grammar_correctness': ('t5_base','model'),
't5.classify.sentiment': ('t5_base','model'),
@@ -922,8 +921,7 @@ class NameSpace():
'en.classify.cyberbullying': 'classifierdl_use_cyberbullying', # Alias withouth embedding
'en.classify.sarcasm': 'classifierdl_use_sarcasm', # Alias withouth embedding
'en.sentiment.twitter': 'sentimentdl_use_twitter', # Alias withouth embedding
'en.sentiment.imdb': 'sentimentdl_glove_imdb', # Default sentiment imdb with embeddigns glvoe


#2.6 Release models
'en.yake' :'yake',

@@ -1036,7 +1034,9 @@ class NameSpace():
"en.ner.airline":"nerdl_atis_840b_300d",
"en.ner.aspect.airline":"nerdl_atis_840b_300d",
"en.ner.aspect.atis":"nerdl_atis_840b_300d",

# 2.7.3 snips
'en.classify.snips' : 'nerdl_snips_100d',
'en.ner.snips' : 'classifierdl_use_snips'


},
@@ -1180,6 +1180,15 @@ class NameSpace():
'bn.ner':'ner_jifs_glove_840B_300d',
'bn.ner.glove':'ner_jifs_glove_840B_300d',

'bn.ner.cc_300d' : 'bengaliner_cc_300d',

'bn.embed.cc_300d':'bengali_cc_300d',


'bn.embed':'bengali_cc_300d',
'bn.embed.glove':'bengali_cc_300d',


},
'br': {
'br.stopwords': 'stopwords_br',
@@ -1453,6 +1462,7 @@ class NameSpace():

'ur.sentiment' : 'sentimentdl_urduvec_imdb',
'ur.embed' : 'urduvec_140M_300d' , # default ur embeds
'ur.embed.glove.300d' : 'urduvec_140M_300d' , # default ur embeds
'ur.embed.urdu_vec_140M_300d' : 'urduvec_140M_300d' ,
'ur.ner' : 'uner_mk_140M_300d' ,
'ur.ner.mk_140M_300d' : 'uner_mk_140M_300d' ,
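
A minimal sketch using the Bengali and Urdu references added above (not part of this commit; output columns are assumptions):

```python
import nlu

# Bengali embeddings registered in this release
bn_embed = nlu.load('bn.embed')             # resolves to bengali_cc_300d
print(bn_embed.predict('আমি বাংলায় লিখি'))

# Urdu word embeddings, now also resolvable as a multi-lingual embedding
ur_embed = nlu.load('ur.embed.glove.300d')  # resolves to urduvec_140M_300d
print(ur_embed.predict('یہ ایک مثال ہے'))
```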
28 changes: 14 additions & 14 deletions nlu/pipeline.py
@@ -38,7 +38,7 @@ def __init__(self):
self.output_different_levels = True
self.light_pipe_configured = False
self.spark_non_light_transformer_pipe = None
self.pipe_components = [] # orderd list of nlu_component objects
self.components = [] # orderd list of nlu_component objects
self.output_datatype = 'pandas' # What data type should be returned after predict either spark, pandas, modin, numpy, string or array
self.lang = 'en'
def isInstanceOfNlpClassifer(self, model):
@@ -70,7 +70,7 @@ def configure_outputs(self, component, nlu_reference):
i = 0
while can_use_name == False:
can_use_name = True
for c in self.pipe_components:
for c in self.components:
if new_output_name in c.component_info.spark_input_column_names + c.component_info.spark_output_column_names and c.component_info.name != component.component_info.name:
can_use_name = False
if can_use_name == False:
@@ -89,7 +89,7 @@ def add(self, component, nlu_reference="default_name", pretrained_pipe_component
:return:
'''
self.nlu_reference = nlu_reference
self.pipe_components.append(component)
self.components.append(component)
# ensure that input/output cols are properly set
component.__set_missing_model_attributes__()
# Spark NLP model reference shortcut
@@ -223,7 +223,7 @@ def fit(self, dataset=None, dataset_path=None, label_seperator=','):
'''
self.is_fitted = True
stages = []
for component in self.pipe_components:
for component in self.components:
stages.append(component.model)
self.spark_estimator_pipe = Pipeline(stages=stages)

@@ -276,7 +276,7 @@ def get_output_level_of_embeddings_provider(self, field_type, field_name):
'''
# find the component. Column output name should be unique
component_inputs = []
for component in self.pipe_components:
for component in self.components:
if field_name == component.component_info.name:
component_inputs = component.component_info.spark_input_column_names

@@ -286,7 +286,7 @@ def get_output_level_of_embeddings_provider(self, field_type, field_name):
if 'embed' in input_name: target_output_component = input_name

# get the model that outputs that feature
for component in self.pipe_components:
for component in self.components:
component_outputs = component.component_info.spark_output_column_names
for input_name in component_outputs:
if target_output_component == input_name:
@@ -737,7 +737,7 @@ def resolve_input_dependent_component_to_output_level(self, component):

# (2.) A classifier, which is using sentence/doc embeddings.
# We iterate over the pipe and check which Embed component is feeding the classifier and what the input that embed annotator is (sent or doc)
for c in self.pipe_components:
for c in self.components:
# check if c is of sentence embedding class which is always input dependent
if any ( isinstance(c.model, e ) for e in self.all_embeddings['input_dependent'] ) :
if 'document' in c.component_info.spark_input_column_names : return 'document'
@@ -776,7 +776,7 @@ def infer_and_set_output_level(self):
bad_types = [ 'util','document','sentence']
bad_names = ['token']

for c in self.pipe_components[::-1]:
for c in self.components[::-1]:
if any (t in c.component_info.type for t in bad_types) : continue
if any (n in c.component_info.name for n in bad_names) : continue
self.output_level = self.resolve_component_to_output_level(c)
@@ -792,7 +792,7 @@ def get_chunk_col_name(self):
:return: Name of the chunk type column in the dataset
'''

for component in self.pipe_components:
for component in self.components:
if component.component_info.output_level == 'chunk':
# Usually al chunk components ahve only one output and that is the cunk col so we can safely just pass the first element of the output list to the caller
logger.info("Detected %s as chunk output column for later zipping", component.component_info.name)
@@ -806,7 +806,7 @@ def resolve_field_to_output_level(self, field,f_type):
:return: The output level of the field
'''
target = field.split('.')[0]
for c in self.pipe_components:
for c in self.components:
if target in c.component_info.spark_output_column_names:
# MultiClassifier outputs should never be at same output level as pipe, returning special_case takes care of this
if isinstance(c.model, (MultiClassifierDLModel, MultiClassifierDLApproach,YakeModel)): return "multi_level"
@@ -1090,7 +1090,7 @@ def check_if_sentence_level_requirements_met(self):
:return:
'''

for c in self.pipe_components:
for c in self.components:
if 'sentence' in c.component_info.spark_output_column_names : return True
return False

@@ -1215,10 +1215,10 @@ def predict(self, data, output_level='', positions=False, keep_stranger_features
if output_level == 'chunk':
# If no chunk output component in pipe we must add it and run the query PipelineQueryVerifier again
chunk_provided = False
for component in self.pipe_components:
for component in self.components:
if component.component_info.output_level == 'chunk': chunk_provided = True
if chunk_provided == False:
self.pipe_components.append(nlu.get_default_component_of_type('chunk'))
self.components.append(nlu.get_default_component_of_type('chunk'))
# this could break indexing..

self = nlu.pipeline_logic.PipelineQueryVerifier.check_and_fix_nlu_pipeline(self)
@@ -1402,7 +1402,7 @@ def predict(self, data, output_level='', positions=False, keep_stranger_features
fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
print(exc_type, fname, exc_tb.tb_lineno)
print(
'Stuck? Contact us on Slack! https://join.slack.com/t/spark-nlp/shared_invite/zt-j5ttxh0z-Fn3lQSG1Z0KpOs_SRxjdyw0196BQCDPY')
'Stuck? Contact us on Slack! https://join.slack.com/t/spark-nlp/shared_invite/zt-lutct9gm-kuUazcyFKhuGY3_0AMkxqA')
if verbose :
err = sys.exc_info()[1]
print(str(err))
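
After the rename above, downstream code reaches the ordered component list via pipe.components. A small sketch (not part of this commit) of how that attribute can be inspected:

```python
import nlu

pipe = nlu.load('en.sentiment.twitter')
for c in pipe.components:
    # component_info holds the NLU-level metadata referenced throughout pipeline.py
    print(c.component_info.name, c.component_info.spark_output_column_names)
```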
