all

hamiltonion · Oct 17, 2019 · 233d9ec · 233d9ec
commit 233d9ec
Show file tree

Hide file tree

Showing 132 changed files with 7,807 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,18 @@
+KBQA/actorName.txt 
+KBQA/movieName.txt
+*.json
+*.pyc
+semantic_search/elasticsearch/data/*.txt
+semantic_search/elasticsearch/data/attr_ac.pkl 
+ie/deepdive/udf/bazaar/*
+ie/deepdive/*.txt
+ie/deepdive/run/*
+ie/deepdive/*.csv
+*.txt
+*.csv
+ie/craw/craw_all_baidu/craws/*
+ie/re_cnn_att/data/*.csv
+ie/re_cnn_att/data/*.txt
+ie/re_cnn_att/data/*.pkl
+ie/re_cnn_att/data/*.json
+ie/re_cnn_att/thirdpart/*
diff --git a/KBQA/patternREfO/get_dict.sh b/KBQA/patternREfO/get_dict.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+# Export the movie and actor name dictionaries from the hudong and baidu DBs.
+# Change the MySQL user and password below to match your own DB.
+# NOTE(review): MySQL warns that a password given on the command line can be
+# insecure; consider an option file instead.
+mysql -uroot -pnlp < ./data/get_dict.txt
+
+# Presumably the SQL in data/get_dict.txt writes its *Name.txt exports into
+# /var/lib/mysql-files -- verify against that file.
+sudo cp /var/lib/mysql-files/*Name.txt .
+
+cat baidu_actorName.txt hudong_actorName.txt | sort -u > actorTmp.txt
+cat baidu_movieName.txt hudong_movieName.txt | sort -u > movieTmp.txt
+# Append the "nr" (actor) and "nz" (movie) tags for jieba
+awk '{print $0 " nr"}' actorTmp.txt > actorName.txt
+awk '{print $0 " nz"}' movieTmp.txt > movieName.txt
+
+# Remove the per-source exports. The shell expands this argument as a glob,
+# not a regex: [!am]* selects names that do not start with "a" or "m", so the
+# generated actorName.txt / movieName.txt (and the *Tmp.txt files) are kept.
+rm -- [!am]*Name.txt
diff --git a/KBQA/patternREfO/query.py b/KBQA/patternREfO/query.py
@@ -0,0 +1,31 @@
+#!/usr/bin/env python
+# coding=utf-8
+
+from SPARQLWrapper import SPARQLWrapper, JSON
+from utils.word_tagging import Tagger
+from utils.rules import customize_rules
+
+if __name__ == "__main__":
+    # One-off setup: SPARQL endpoint, tagger with the user dictionaries, rules.
+    print("init...........")
+    endpoint = SPARQLWrapper("http://localhost:3030/kg_movie/query")
+    tagger = Tagger(['data/actorName.txt', 'data/movieName.txt'])
+    rules = customize_rules()
+    print("done \n")
+
+    # Interactive loop: read a question, tag it, run every rule that applies.
+    while True:
+        print("Please input your question: ")
+        words = tagger.get_word_objects(input())
+
+        for rule in rules:
+            sparql = rule.apply(words)
+            if not sparql:
+                # This rule produced no query for the question.
+                continue
+
+            endpoint.setQuery(sparql)
+            endpoint.setReturnFormat(JSON)
+            bindings = endpoint.query().convert()["results"]["bindings"]
+
+            if bindings:
+                for row in bindings:
+                    print("Result: ", row["x0"]["value"])
+            else:
+                print("No answer found :(")
diff --git a/KBQA/patternREfO/utils/__init__.py b/KBQA/patternREfO/utils/__init__.py
diff --git a/KBQA/patternREfO/utils/rules.py b/KBQA/patternREfO/utils/rules.py
@@ -0,0 +1,115 @@
+#!/usr/bin/env python
+# coding=utf-8
+
+import re
+from refo import finditer, Predicate, Star, Any
+
+# SPARQL config
+# Prefix declarations substituted into the {preamble} slot of SPARQL_TEM.
+SPARQL_PREAMBLE = u"""  
+PREFIX : <http://www.kgdemo.com#>
+PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+"""              
+
+# Query skeleton filled in with str.format; it has three slots: {preamble},
+# {select} and {expression}. The doubled braces are str.format escapes that
+# produce the literal "{" and "}" around the WHERE body.
+SPARQL_TEM = u"{preamble}\n" + \
+             u"SELECT DISTINCT {select} WHERE {{\n" + \
+             u"{expression}\n" + \
+             u"}}\n"    
+
+# Indentation the question builders prepend to the WHERE-clause expression.
+INDENT = "    "  
+
+class W(Predicate):
+    """Object-oriented regex for words.
+
+    A word matches when its ``token`` and its ``pos`` each match the
+    corresponding pattern from the start up to the end anchor.
+
+    Args:
+        token: regular expression for the word text (default: anything).
+        pos: regular expression for the POS tag (default: anything).
+    """
+
+    def __init__(self, token=".*", pos=".*"):
+        # Group the pattern before anchoring it. With a bare "$" suffix a
+        # pattern such as "a|b" becomes "a|b$", whose first alternative is
+        # unanchored and would accept any word merely starting with "a".
+        self.token = re.compile("(?:" + token + ")$")
+        self.pos = re.compile("(?:" + pos + ")$")
+        super(W, self).__init__(self.match)
+
+    def match(self, word):
+        """Return a truthy value when both ``word.token`` and ``word.pos`` match."""
+        token_match = self.token.match(word.token)
+        pos_match = self.pos.match(word.pos)
+        return token_match and pos_match
+
+class Rule(object):
+    """Pair a match pattern with the action that handles what it matched.
+
+    Args:
+        condition: pattern handed to ``finditer`` to search a sentence.
+        action: callable that receives the list of matched words and
+            returns the result of the rule.
+    """
+
+    def __init__(self, condition=None, action=None):
+        # Both parts are mandatory; the None defaults only permit keyword use.
+        assert condition and action
+        self.condition = condition
+        self.action = action
+
+    def apply(self, sentence):
+        """Run the action on every word covered by a match of the condition.
+
+        Args:
+            sentence: sliceable sequence of word objects.
+
+        Returns:
+            Whatever ``action`` returns for the concatenated matched words
+            (the list is empty when nothing matched).
+        """
+        matches = []
+        for m in finditer(self.condition, sentence):
+            start, end = m.span()
+            matches.extend(sentence[start:end])
+        return self.action(matches)
+
+def _escape_sparql_literal(text):
+    """Escape ``text`` for use inside a single-quoted SPARQL string literal."""
+    # The backslash must be handled first so later escapes are not doubled.
+    return (text.replace(u"\\", u"\\\\")
+                .replace(u"'", u"\\'")
+                .replace(u"\n", u"\\n")
+                .replace(u"\r", u"\\r"))
+
+
+def _first_tagged_query(words, pos_tags, expression_template):
+    """Build a query from the first word whose POS tag is in ``pos_tags``.
+
+    Args:
+        words: iterable of word objects exposing ``token`` and ``pos``.
+        pos_tags: POS tags that identify the entity the question is about.
+        expression_template: WHERE-clause body with a ``{person}`` slot.
+
+    Returns:
+        The complete SPARQL query string, or None when no word qualifies.
+    """
+    for w in words:
+        if w.pos in pos_tags:
+            # Tokens come from user input, so they are escaped before being
+            # placed between the single quotes of the template.
+            expression = expression_template.format(
+                person=_escape_sparql_literal(w.token))
+            return SPARQL_TEM.format(preamble=SPARQL_PREAMBLE,
+                                     select=u"?x0",
+                                     expression=INDENT + expression)
+    return None
+
+
+def who_is_question(x):
+    """Return a query for the ``actor_bio`` of the first person-like word in ``x``.
+
+    Accepts the POS tags "nr", "x" and "nrt" ("nrt" added so this action
+    agrees with where_is_from_question and the person pattern of the rules).
+    Returns None when ``x`` contains no such word.
+    """
+    return _first_tagged_query(
+        x, ("nr", "x", "nrt"),
+        u" ?a :actor_chName '{person}'. \n"
+        u"             ?a :actor_bio ?x0")
+
+
+def where_is_from_question(x):
+    """Return a query for the ``actor_birthPlace`` of the first person-like word.
+
+    Accepts the POS tags "nr", "x" and "nrt"; returns None when ``x``
+    contains no such word.
+    """
+    return _first_tagged_query(
+        x, ("nr", "x", "nrt"),
+        u" ?a :actor_chName '{person}'.\n"
+        u"             ?a :actor_birthPlace ?x0")
+
+
+def movie_intro_question(x):
+    """Return a query for the ``movie_bio`` of the first word tagged "nz".
+
+    Returns None when ``x`` contains no such word.
+    """
+    return _first_tagged_query(
+        x, ("nz",),
+        u" ?a :movie_chName '{person}'. \n"
+        u"             ?a :movie_bio ?x0")
+
+def customize_rules():
+    """Build the question-matching rules.
+
+    Returns:
+        list of Rule objects, each pairing a word pattern with the function
+        that turns the matched words into a SPARQL query.
+    """
+    # TODO: customize your own rules here
+    person = (W(pos="nr") | W(pos="x") | W(pos="nrt") | W(pos="nz"))
+    movie = (W(pos="nz"))
+    place = (W("出生地") | W("出生"))
+    # "介绍" is a word, not a POS tag, so it has to be matched against the
+    # token; as a pos pattern it could never match.
+    intro = (W("简介") | W("介绍"))
+
+    rules = [
+
+        # "<pronoun> 是 <person>" or "<person> 是 <pronoun>"
+        Rule(condition=W(pos="r") + W("是") + person | \
+                       person + W("是") + W(pos="r"),
+             action=who_is_question),
+
+        # "<person> ... 出生地/出生 ..."
+        Rule(condition=person + Star(Any(), greedy=False) + place + Star(Any(), greedy=False),
+             action=where_is_from_question),
+
+        # "<movie> ... 简介/介绍 ..."
+        Rule(condition=movie + Star(Any(), greedy=False) + intro + Star(Any(), greedy=False),
+             action=movie_intro_question)
+
+    ]
+    return rules
diff --git a/KBQA/patternREfO/utils/word_tagging.py b/KBQA/patternREfO/utils/word_tagging.py
@@ -0,0 +1,44 @@
+# encoding=utf-8
+
+"""
+
+@author: SimmerChan
+
+@contact: [email protected]
+
+@file: word_tagging.py
+
+@time: 2017/12/20 15:31
+
+@desc: 定义Word类的结构；定义Tagger类，实现自然语言转为Word对象的方法。
+
+"""
+import jieba
+import jieba.posseg as pseg
+
+
+class Word(object):
+    """A segmented word: its text (``token``) and its POS tag (``pos``)."""
+
+    def __init__(self, token, pos):
+        self.token = token
+        self.pos = pos
+
+    def __repr__(self):
+        # Makes printed lists of words readable instead of showing addresses.
+        return "Word(token={!r}, pos={!r})".format(self.token, self.pos)
+
+
+class Tagger:
+    """Segment and POS-tag sentences with jieba plus user dictionaries."""
+
+    def __init__(self, dict_paths):
+        """Load every user dictionary into jieba.
+
+        Args:
+            dict_paths: iterable of file paths passed to
+                ``jieba.load_userdict`` one by one.
+        """
+        for p in dict_paths:
+            jieba.load_userdict(p)
+
+    def get_word_objects(self, sentence):
+        """Return ``sentence`` as a list of :class:`Word` (token, pos) objects."""
+        # The token is used as-is: encoding it to UTF-8 and decoding it again
+        # only reproduced the same text.
+        return [Word(word, tag) for word, tag in pseg.cut(sentence)]
+
+if __name__ == '__main__':
+    # Manual check: echo the tagging result of every line typed on stdin.
+    tagger = Tagger(['../data/actorName.txt', '../data/movieName.txt'])
+    while True:
+        s = input()
+        # Tag once and reuse the result for both printouts.
+        words = tagger.get_word_objects(s)
+        print("tagger.get_word_objects(s): ", words)
+        for w in words:
+            print(w.token, w.pos)
diff --git a/README.md b/README.md
@@ -0,0 +1,5 @@
+# Zero_knowledge_graph
+从零开始构建知识图谱
+
+# 简介
+为了构建中文百科类知识图谱，我们参考漆桂林老师团队做的[zhishi.me](http://zhishi.me/)。目标是包含百度百科、互动百科、中文wiki百科的知识，千万级实体数量和亿级别的关系数目。目前已完成百度百科和互动百科部分，其中百度百科词条4,190,390条，互动百科词条4,382,575条。转换为RDF格式得到三元组 128,596,018个。存入 neo4j中得到节点 16,498,370个，关系 56,371,456个，属性 61,967,517个。<br>
diff --git a/ie/craw/baidu_baike/baidu_baike/__init__.py b/ie/craw/baidu_baike/baidu_baike/__init__.py
diff --git a/ie/craw/baidu_baike/baidu_baike/items.py b/ie/craw/baidu_baike/baidu_baike/items.py
@@ -0,0 +1,43 @@
+# -*- coding: utf-8 -*-
+
+# Define here the models for your scraped items
+#
+# See documentation in:
+# https://doc.scrapy.org/en/latest/topics/items.html
+
+import scrapy
+
+
+class BaiduBaikeItem(scrapy.Item):
+    """Scrapy item declaring the actor and movie fields of the baidu_baike crawler."""
+    # define the fields for your item here like:
+    # name = scrapy.Field()
+    # Actor
+    # Actor-related attributes
+    actor_id = scrapy.Field()
+    actor_bio = scrapy.Field()
+    actor_chName = scrapy.Field()
+    actor_foreName = scrapy.Field()
+    actor_nationality = scrapy.Field()
+    actor_constellation = scrapy.Field()
+    actor_birthPlace = scrapy.Field()
+    actor_birthDay = scrapy.Field()
+    actor_repWorks = scrapy.Field()
+    actor_achiem = scrapy.Field()
+    actor_brokerage = scrapy.Field()
+
+    # movie
+    # Movie-related attributes
+    movie_id = scrapy.Field()
+    movie_bio = scrapy.Field()
+    movie_chName = scrapy.Field()
+    movie_foreName = scrapy.Field()
+    movie_prodTime = scrapy.Field()
+    movie_prodCompany = scrapy.Field()
+    movie_director = scrapy.Field()
+    movie_screenwriter = scrapy.Field()
+    movie_genre = scrapy.Field()
+    movie_star = scrapy.Field()
+    movie_length = scrapy.Field()
+    # NOTE(review): "rekease" looks like a typo for "release"; the name is the
+    # field key, so check every user of it before renaming.
+    movie_rekeaseTime = scrapy.Field()
+    movie_language = scrapy.Field()
+    movie_achiem = scrapy.Field()