Merge pull request #38 from JohnSnowLabs/1.1.3rc1
Intent and action classification, analyze Chinese news and the crypto market, train a classifier that understands 100+ languages, translate between 200+ languages, answer questions, summarize text, and much more in NLU 1.1.3
C-K-Loan authored Feb 28, 2021
2 parents a223eee + df3da0d commit 8bd84ce
Showing 17 changed files with 403 additions and 69 deletions.
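
A minimal usage sketch for the release headline above (not part of this commit); the 'en.classify.snips' reference is taken from the nlu/namespace.py diff further below, and the exact output columns are assumptions:

```python
import nlu

# Intent/action classification: 'en.classify.snips' is one of the references
# registered in this release (see the namespace.py diff below).
pipe = nlu.load('en.classify.snips')

# predict() accepts strings, lists or DataFrames and returns a pandas DataFrame
df = pipe.predict('Book a table for two at an Italian restaurant tonight')
print(df)
```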
2 changes: 1 addition & 1 deletion docs/_data/navigation.yml
@@ -15,7 +15,7 @@ header:
- title: '<span style="color: #FF8A00;"><i class="fab fa-github fa-2x"></i></span>'
url: https://github.com/JohnSnowLabs/nlu
- title: '<span style="color: #FF8A00;"><i class="fab fa-slack-hash fa-2x"></i></span>'
url: https://join.slack.com/t/spark-nlp/shared_invite/enQtNjA4MTE2MDI1MDkxLWVjNWUzOGNlODg1Y2FkNGEzNDQ1NDJjMjc3Y2FkOGFmN2Q3ODIyZGVhMzU0NGM3NzRjNDkyZjZlZTQ0YzY1N2I
url: https://join.slack.com/t/spark-nlp/shared_invite/zt-lutct9gm-kuUazcyFKhuGY3_0AMkxqA
docs-en:
- title: Getting Started
children:
2 changes: 1 addition & 1 deletion docs/backup.md
@@ -14,7 +14,7 @@ article_header:
url: https://github.com/johnsnowlabs/spark-nlp
- text: '<i class="fab fa-slack-hash"></i> Slack'
type: outline-theme-dark
url: https://join.slack.com/t/spark-nlp/shared_invite/enQtNjA4MTE2MDI1MDkxLWVjNWUzOGNlODg1Y2FkNGEzNDQ1NDJjMjc3Y2FkOGFmN2Q3ODIyZGVhMzU0NGM3NzRjNDkyZjZlZTQ0YzY1N2I
url: https://join.slack.com/t/spark-nlp/shared_invite/zt-lutct9gm-kuUazcyFKhuGY3_0AMkxqA

height: 50vh
theme: dark
5 changes: 2 additions & 3 deletions docs/en/load_api.md
@@ -53,12 +53,11 @@ To configure your model or pipeline, first load a NLU component and use the prin
The print output tells you at which index of the pipe_components attribute each NLU component is located.
A model can be configured via setters, which are named after the model's parameters.


```python
#example for configuring the first element in the pipe
# example for configuring the first element in the pipe
pipe = nlu.load('en.sentiment.twitter')
pipe.generate_class_metadata_table()
document_assembler_model = pipe.pipe_components[0].model
document_assembler_model = pipe.components[0].model
document_assembler_model.setCleanupMode('inplace')
```
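
As a follow-up sketch (not in the original docs page): every NLU component wraps a PySpark ML stage, so its configurable parameters can be listed with the standard PySpark API before calling a setter.

```python
import nlu

pipe = nlu.load('en.sentiment.twitter')
document_assembler_model = pipe.components[0].model

# explainParams() is standard PySpark and lists parameters such as cleanupMode,
# each of which corresponds to a setter like setCleanupMode shown above.
print(document_assembler_model.explainParams())
```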

182 changes: 178 additions & 4 deletions docs/en/release_notes.md

Large diffs are not rendered by default.

18 changes: 12 additions & 6 deletions nlu/__init__.py
@@ -1,4 +1,4 @@
__version__ = '1.1.1'
__version__ = '1.1.3'

import sys

@@ -77,6 +77,7 @@ def check_python_version():
from nlu.components.embedding import Embeddings
from nlu.components.util import Util
from nlu.components.utils.ner_to_chunk_converter import ner_to_chunk_converter
from nlu.components.utils.sentence_embeddings.spark_nlp_sentence_embedding import SparkNLPSentenceEmbeddings

# sentence
from nlu.components.sentence_detectors.pragmatic_sentence_detector.sentence_detector import PragmaticSentenceDetector
@@ -159,7 +160,6 @@ def check_python_version():

import os
import sparknlp
sparknlp.start()

def read_nlu_info(path):
f = open(os.path.join(path,'nlu_info.txt'), "r")
@@ -227,7 +227,10 @@ def enable_verbose():
ch.setLevel(logging.INFO)
logger.addHandler(ch)


def is_spark_23_installed():
version = pyspark.version.__version__
if '2.3' == version[:3]: return True
return False
def load(request ='from_disk', path=None,verbose=False,version_checks=True):
'''
Load either a prebuild pipeline or a set of components identified by a whitespace seperated list of components
@@ -239,7 +242,8 @@ def load(request ='from_disk', path=None,verbose=False,version_checks=True):
'''
gc.collect()
# if version_checks : check_pyspark_install()
spark = sparknlp.start()

spark = sparknlp.start(spark23=is_spark_23_installed())
spark.catalog.clearCache()
spark_started = True
if verbose:
@@ -295,9 +299,11 @@ def parse_language_from_nlu_ref(nlu_ref):

def resolve_multi_lang_embed(language,sparknlp_reference):
if language == 'ar' and 'glove' in sparknlp_reference : return 'arabic_w2v_cc_300d'
if language == 'ur' : return 'urduvec_140M_300d'
else : return sparknlp_reference



def get_default_component_of_type(missing_component_type,language='en'):
'''
This function returns a default component for a missing component type.
@@ -327,11 +333,11 @@ def get_default_component_of_type(missing_component_type,language='en'):
if missing_component_type == 'ner_converter': return Util('ner_converter')

else:
multi_lang =['ar']
multi_lang =['ar','ur']
# if there is an @ in the name, we must get some specific pretrained model from the sparknlp reference that should follow after the @
missing_component_type, sparknlp_reference = missing_component_type.split('@')
if 'embed' in missing_component_type:
# TODO RESOLVE MULTI LANG EMBEDS

if language in multi_lang : sparknlp_reference = resolve_multi_lang_embed(language,sparknlp_reference)
return construct_component_from_identifier(language=language, component_type='embed',
nlp_ref=sparknlp_reference)
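
The change above defers starting the Spark session from import time to load(), which now selects the Spark 2.3 compatible build based on the installed PySpark version. A hedged sketch of that behaviour:

```python
import pyspark
import sparknlp

def is_spark_23_installed():
    # True for PySpark 2.3.x installations
    return pyspark.version.__version__[:3] == '2.3'

# load() now starts the session itself instead of relying on import-time startup
spark = sparknlp.start(spark23=is_spark_23_installed())
```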
5 changes: 4 additions & 1 deletion nlu/components/util.py
@@ -1,4 +1,4 @@
from nlu.pipe_components import SparkNLUComponent, NLUComponent
from nlu.pipe_components import SparkNLUComponent

class Util(SparkNLUComponent):

@@ -20,4 +20,7 @@ def __init__(self, annotator_class='document_assembler', component_type='util',
elif annotator_class == 'ner_to_chunk_converter' :
from nlu import NerToChunkConverter
self.model = NerToChunkConverter.get_default_model()
elif annotator_class == 'sentence_embeddings':
from nlu import SparkNLPSentenceEmbeddings
self.model = SparkNLPSentenceEmbeddings.get_default_model()
SparkNLUComponent.__init__(self, annotator_class, component_type)
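
A sketch of what the new 'sentence_embeddings' branch wires up (assumptions: a Spark session is already running and the remaining constructor arguments keep their defaults):

```python
import sparknlp
from nlu.components.util import Util

sparknlp.start()                                   # the annotator needs a live Spark session
util_component = Util(annotator_class='sentence_embeddings')
print(type(util_component.model))                  # Spark NLP SentenceEmbeddings annotator
```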
sentence_embeddings component configuration (JSON)
@@ -8,13 +8,13 @@
],
"inputs": [
"document",
"embeddings"
"word_embeddings"
],
"type": "sentence_embeddings",
"file_dependencies": {},
"spark_input_column_names": [
"document",
"embeddings"
"word_embeddings"
],
"spark_output_column_names": [
"sentence_embeddings"
nlu/components/utils/sentence_embeddings/spark_nlp_sentence_embedding.py
@@ -2,10 +2,10 @@
import sparknlp
from sparknlp.annotator import *

class SparkNLPSentenceEmbeddinge:
class SparkNLPSentenceEmbeddings:
@staticmethod
def get_default_model():
return SentenceEmbeddings() \
.setInputCols(["document", "embeddings"]) \
.setInputCols(["document", "word_embeddings"]) \
.setOutputCol("sentence_embeddings") \
.setPoolingStrategy("AVERAGE")
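
For context, a hedged sketch (not part of this commit) of how the default stage above consumes a word-embeddings column; the "word_embeddings" name matches the renamed input in the component configuration above:

```python
import sparknlp
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import Tokenizer, WordEmbeddingsModel, SentenceEmbeddings
from pyspark.ml import Pipeline

spark = sparknlp.start()

document_assembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")
word_embeddings = WordEmbeddingsModel.pretrained() \
    .setInputCols(["document", "token"]) \
    .setOutputCol("word_embeddings")
sentence_embeddings = SentenceEmbeddings() \
    .setInputCols(["document", "word_embeddings"]) \
    .setOutputCol("sentence_embeddings") \
    .setPoolingStrategy("AVERAGE")

pipeline = Pipeline(stages=[document_assembler, tokenizer, word_embeddings, sentence_embeddings])
```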
18 changes: 14 additions & 4 deletions nlu/namespace.py
@@ -241,7 +241,6 @@ class NameSpace():


# 2.7.0 new aliases
't5': ('t5_base','model'),
't5.summarize': ('t5_base','model'),
't5.classify.grammar_correctness': ('t5_base','model'),
't5.classify.sentiment': ('t5_base','model'),
@@ -922,8 +921,7 @@ class NameSpace():
'en.classify.cyberbullying': 'classifierdl_use_cyberbullying', # Alias withouth embedding
'en.classify.sarcasm': 'classifierdl_use_sarcasm', # Alias withouth embedding
'en.sentiment.twitter': 'sentimentdl_use_twitter', # Alias withouth embedding
'en.sentiment.imdb': 'sentimentdl_glove_imdb', # Default sentiment imdb with embeddigns glvoe


#2.6 Release models
'en.yake' :'yake',

@@ -1036,7 +1034,9 @@ class NameSpace():
"en.ner.airline":"nerdl_atis_840b_300d",
"en.ner.aspect.airline":"nerdl_atis_840b_300d",
"en.ner.aspect.atis":"nerdl_atis_840b_300d",

# 2.7.3 snips
'en.classify.snips' : 'nerdl_snips_100d',
'en.ner.snips' : 'classifierdl_use_snips'


},
@@ -1180,6 +1180,15 @@ class NameSpace():
'bn.ner':'ner_jifs_glove_840B_300d',
'bn.ner.glove':'ner_jifs_glove_840B_300d',

'bn.ner.cc_300d' : 'bengaliner_cc_300d',

'bn.embed.cc_300d':'bengali_cc_300d',


'bn.embed':'bengali_cc_300d',
'bn.embed.glove':'bengali_cc_300d',


},
'br': {
'br.stopwords': 'stopwords_br',
@@ -1453,6 +1462,7 @@ class NameSpace():

'ur.sentiment' : 'sentimentdl_urduvec_imdb',
'ur.embed' : 'urduvec_140M_300d' , # default ur embeds
'ur.embed.glove.300d' : 'urduvec_140M_300d' , # default ur embeds
'ur.embed.urdu_vec_140M_300d' : 'urduvec_140M_300d' ,
'ur.ner' : 'uner_mk_140M_300d' ,
'ur.ner.mk_140M_300d' : 'uner_mk_140M_300d' ,
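
A minimal sketch using the Bengali and Urdu references added above (not part of this commit; output columns are assumptions):

```python
import nlu

# Bengali embeddings registered in this release
bn_embed = nlu.load('bn.embed')             # resolves to bengali_cc_300d
print(bn_embed.predict('আমি বাংলায় লিখি'))

# Urdu word embeddings, now also resolvable as a multi-lingual embedding
ur_embed = nlu.load('ur.embed.glove.300d')  # resolves to urduvec_140M_300d
print(ur_embed.predict('یہ ایک مثال ہے'))
```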
28 changes: 14 additions & 14 deletions nlu/pipeline.py
@@ -38,7 +38,7 @@ def __init__(self):
self.output_different_levels = True
self.light_pipe_configured = False
self.spark_non_light_transformer_pipe = None
self.pipe_components = [] # orderd list of nlu_component objects
self.components = [] # orderd list of nlu_component objects
self.output_datatype = 'pandas' # What data type should be returned after predict either spark, pandas, modin, numpy, string or array
self.lang = 'en'
def isInstanceOfNlpClassifer(self, model):
@@ -70,7 +70,7 @@ def configure_outputs(self, component, nlu_reference):
i = 0
while can_use_name == False:
can_use_name = True
for c in self.pipe_components:
for c in self.components:
if new_output_name in c.component_info.spark_input_column_names + c.component_info.spark_output_column_names and c.component_info.name != component.component_info.name:
can_use_name = False
if can_use_name == False:
@@ -89,7 +89,7 @@ def add(self, component, nlu_reference="default_name", pretrained_pipe_component
:return:
'''
self.nlu_reference = nlu_reference
self.pipe_components.append(component)
self.components.append(component)
# ensure that input/output cols are properly set
component.__set_missing_model_attributes__()
# Spark NLP model reference shortcut
@@ -223,7 +223,7 @@ def fit(self, dataset=None, dataset_path=None, label_seperator=','):
'''
self.is_fitted = True
stages = []
for component in self.pipe_components:
for component in self.components:
stages.append(component.model)
self.spark_estimator_pipe = Pipeline(stages=stages)

@@ -276,7 +276,7 @@ def get_output_level_of_embeddings_provider(self, field_type, field_name):
'''
# find the component. Column output name should be unique
component_inputs = []
for component in self.pipe_components:
for component in self.components:
if field_name == component.component_info.name:
component_inputs = component.component_info.spark_input_column_names

@@ -286,7 +286,7 @@ def get_output_level_of_embeddings_provider(self, field_type, field_name):
if 'embed' in input_name: target_output_component = input_name

# get the model that outputs that feature
for component in self.pipe_components:
for component in self.components:
component_outputs = component.component_info.spark_output_column_names
for input_name in component_outputs:
if target_output_component == input_name:
@@ -737,7 +737,7 @@ def resolve_input_dependent_component_to_output_level(self, component):

# (2.) A classifier, which is using sentence/doc embeddings.
# We iterate over the pipe and check which Embed component is feeding the classifier and what the input that embed annotator is (sent or doc)
for c in self.pipe_components:
for c in self.components:
# check if c is of sentence embedding class which is always input dependent
if any ( isinstance(c.model, e ) for e in self.all_embeddings['input_dependent'] ) :
if 'document' in c.component_info.spark_input_column_names : return 'document'
@@ -776,7 +776,7 @@ def infer_and_set_output_level(self):
bad_types = [ 'util','document','sentence']
bad_names = ['token']

for c in self.pipe_components[::-1]:
for c in self.components[::-1]:
if any (t in c.component_info.type for t in bad_types) : continue
if any (n in c.component_info.name for n in bad_names) : continue
self.output_level = self.resolve_component_to_output_level(c)
@@ -792,7 +792,7 @@ def get_chunk_col_name(self):
:return: Name of the chunk type column in the dataset
'''

for component in self.pipe_components:
for component in self.components:
if component.component_info.output_level == 'chunk':
# Usually al chunk components ahve only one output and that is the cunk col so we can safely just pass the first element of the output list to the caller
logger.info("Detected %s as chunk output column for later zipping", component.component_info.name)
@@ -806,7 +806,7 @@ def resolve_field_to_output_level(self, field,f_type):
:return: The output level of the field
'''
target = field.split('.')[0]
for c in self.pipe_components:
for c in self.components:
if target in c.component_info.spark_output_column_names:
# MultiClassifier outputs should never be at same output level as pipe, returning special_case takes care of this
if isinstance(c.model, (MultiClassifierDLModel, MultiClassifierDLApproach,YakeModel)): return "multi_level"
@@ -1090,7 +1090,7 @@ def check_if_sentence_level_requirements_met(self):
:return:
'''

for c in self.pipe_components:
for c in self.components:
if 'sentence' in c.component_info.spark_output_column_names : return True
return False

@@ -1215,10 +1215,10 @@ def predict(self, data, output_level='', positions=False, keep_stranger_features
if output_level == 'chunk':
# If no chunk output component in pipe we must add it and run the query PipelineQueryVerifier again
chunk_provided = False
for component in self.pipe_components:
for component in self.components:
if component.component_info.output_level == 'chunk': chunk_provided = True
if chunk_provided == False:
self.pipe_components.append(nlu.get_default_component_of_type('chunk'))
self.components.append(nlu.get_default_component_of_type('chunk'))
# this could break indexing..

self = nlu.pipeline_logic.PipelineQueryVerifier.check_and_fix_nlu_pipeline(self)
@@ -1402,7 +1402,7 @@ def predict(self, data, output_level='', positions=False, keep_stranger_features
fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
print(exc_type, fname, exc_tb.tb_lineno)
print(
'Stuck? Contact us on Slack! https://join.slack.com/t/spark-nlp/shared_invite/zt-j5ttxh0z-Fn3lQSG1Z0KpOs_SRxjdyw0196BQCDPY')
'Stuck? Contact us on Slack! https://join.slack.com/t/spark-nlp/shared_invite/zt-lutct9gm-kuUazcyFKhuGY3_0AMkxqA')
if verbose :
err = sys.exc_info()[1]
print(str(err))
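
After the rename above, downstream code reaches the ordered component list via pipe.components. A small sketch (not part of this commit) of how that attribute can be inspected:

```python
import nlu

pipe = nlu.load('en.sentiment.twitter')
for c in pipe.components:
    # component_info holds the NLU-level metadata referenced throughout pipeline.py
    print(c.component_info.name, c.component_info.spark_output_column_names)
```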
