forked from myhhub/KnowledgeGraph
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 233d9ec
Showing
132 changed files
with
7,807 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
KBQA/actorName.txt | ||
KBQA/movieName.txt | ||
*.json | ||
*.pyc | ||
semantic_search/elasticsearch/data/*.txt | ||
semantic_search/elasticsearch/data/attr_ac.pkl | ||
ie/deepdive/udf/bazaar/* | ||
ie/deepdive/*.txt | ||
ie/deepdive/run/* | ||
ie/deepdive/*.csv | ||
*.txt | ||
*.csv | ||
ie/craw/craw_all_baidu/craws/* | ||
ie/re_cnn_att/data/*.csv | ||
ie/re_cnn_att/data/*.txt | ||
ie/re_cnn_att/data/*.pkl | ||
ie/re_cnn_att/data/*.json | ||
ie/re_cnn_att/thirdpart/* |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
#!/bin/bash

# Build the jieba user dictionaries actorName.txt and movieName.txt from the
# actor/movie names exported out of the hudong and baidu databases.

# Export dict for movie and actor in hudong and baidu DB;
# You need to change the user and pwd for your own DB;
# NOTE(review): the password is passed on the command line; consider a
# client option file instead.
mysql -uroot -pnlp < ./data/get_dict.txt

# presumably get_dict.txt writes its *Name.txt exports into mysql-files;
# verify against that SQL file.
sudo cp /var/lib/mysql-files/*Name.txt .

# Merge both sources and drop duplicate names.
cat baidu_actorName.txt hudong_actorName.txt | sort -u > actorTmp.txt
cat baidu_movieName.txt hudong_movieName.txt | sort -u > movieTmp.txt
# Append "nz" and "nr" tag for jieba
awk '{print $0 " nr"}' actorTmp.txt > actorName.txt
awk '{print $0 " nz"}' movieTmp.txt > movieName.txt

# Remove redundant file
# The previous pattern `^[am].*Name.txt` was regex syntax; as a shell glob it
# requires a literal "^" and "." and therefore matched none of these files.
# Name the per-source exports explicitly so the merged dictionaries are kept.
rm -f -- baidu_actorName.txt hudong_actorName.txt \
         baidu_movieName.txt hudong_movieName.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
#!/usr/bin/env python | ||
# coding=utf-8 | ||
|
||
from SPARQLWrapper import SPARQLWrapper, JSON | ||
from utils.word_tagging import Tagger | ||
from utils.rules import customize_rules | ||
|
||
if __name__ == "__main__":
    # One-time setup: endpoint client, word tagger and the matching rules.
    print("init...........")
    sparql_base = SPARQLWrapper("http://localhost:3030/kg_movie/query")
    tagger = Tagger(['data/actorName.txt', 'data/movieName.txt'])
    rules = customize_rules()
    print("done \n")

    # Interactive loop: read a question, tag it, then try every rule on it.
    while True:
        print("Please input your question: ")
        default_question = input()
        seg_list = tagger.get_word_objects(default_question)

        for rule in rules:
            query = rule.apply(seg_list)
            if not query:
                # This rule produced no query for the question.
                continue

            sparql_base.setQuery(query)
            sparql_base.setReturnFormat(JSON)
            results = sparql_base.query().convert()

            bindings = results["results"]["bindings"]
            if not bindings:
                print("No answer found :(")
                continue
            for row in bindings:
                print("Result: ", row["x0"]["value"])
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,115 @@ | ||
#!/usr/bin/env python | ||
# coding=utf-8 | ||
|
||
import re | ||
from refo import finditer, Predicate, Star, Any | ||
|
||
# SPARQL config
# Prefix declarations placed at the top of every generated query.
SPARQL_PREAMBLE = u"""
PREFIX : <http://www.kgdemo.com#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
"""

# Query skeleton filled in with str.format; the doubled braces are the
# literal "{" and "}" of the WHERE block.
SPARQL_TEM = (
    u"{preamble}\n"
    u"SELECT DISTINCT {select} WHERE {{\n"
    u"{expression}\n"
    u"}}\n"
)

# Prefix put in front of the graph pattern inside the WHERE block.
INDENT = " "
|
||
class W(Predicate):
    """Object-oriented regex for words.

    A predicate over a single word object: it holds when the word's
    ``token`` and ``pos`` attributes each match their regular expression
    from the first character up to the end anchor.

    :param token: regex source for the word text (default: anything).
    :param pos: regex source for the part-of-speech tag (default: anything).
    """

    def __init__(self, token=".*", pos=".*"):
        # Group the pattern before anchoring it. With plain concatenation a
        # pattern such as "a|b" became "a|b$", where the anchor applies to
        # the last alternative only and "a" matches as a mere prefix.
        self.token = re.compile("(?:" + token + ")$")
        self.pos = re.compile("(?:" + pos + ")$")
        super(W, self).__init__(self.match)

    def match(self, word):
        """Return a truthy value when both patterns match ``word``.

        :param word: object exposing ``token`` and ``pos`` strings.
        :return: the ``pos`` match object if both match, otherwise None.
        """
        m1 = self.token.match(word.token)
        m2 = self.pos.match(word.pos)
        return m1 and m2
|
||
class Rule(object):
    """Couples a match pattern with the callback that consumes its matches."""

    def __init__(self, condition=None, action=None):
        """Store the pattern and the callback; both must be truthy."""
        assert condition and action
        self.condition = condition
        self.action = action

    def apply(self, sentence):
        """Run the pattern over ``sentence`` and hand the hits to the action.

        Every span reported by ``finditer`` is sliced out of ``sentence``
        and the slices are concatenated in the order they are found.

        :param sentence: sequence of word objects.
        :return: whatever ``self.action`` returns for the collected words.
        """
        matched_words = []
        for hit in finditer(self.condition, sentence):
            start, end = hit.span()
            matched_words += sentence[start:end]
        return self.action(matched_words)
|
||
def who_is_question(x):
    """Build the SPARQL query that looks up an actor's biography.

    The first word whose POS tag is "nr", "x" or "nrt" is used as the
    actor's Chinese name.

    :param x: iterable of word objects exposing ``token`` and ``pos``.
    :return: the query string, or None when no such word is present.
    """
    select = u"?x0"

    for w in x:
        # "nrt" is accepted as well, in line with the `person` pattern in
        # customize_rules and with where_is_from_question.
        if w.pos in ("nr", "x", "nrt"):
            # Escape backslashes and single quotes so the token cannot end
            # the quoted literal early and corrupt the query.
            person = w.token.replace(u"\\", u"\\\\").replace(u"'", u"\\'")
            e = (u" ?a :actor_chName '{person}'. \n"
                 u" ?a :actor_bio ?x0").format(person=person)
            return SPARQL_TEM.format(preamble=SPARQL_PREAMBLE,
                                     select=select,
                                     expression=INDENT + e)
    return None
|
||
def where_is_from_question(x):
    """Build the SPARQL query that looks up an actor's birth place.

    The first word whose POS tag is "nr", "x" or "nrt" is used as the
    actor's Chinese name.

    :param x: iterable of word objects exposing ``token`` and ``pos``.
    :return: the query string, or None when no such word is present.
    """
    select = u"?x0"

    for w in x:
        if w.pos in ("nr", "x", "nrt"):
            # Escape backslashes and single quotes so the token cannot end
            # the quoted literal early and corrupt the query.
            person = w.token.replace(u"\\", u"\\\\").replace(u"'", u"\\'")
            e = (u" ?a :actor_chName '{person}'.\n"
                 u" ?a :actor_birthPlace ?x0").format(person=person)
            return SPARQL_TEM.format(preamble=SPARQL_PREAMBLE,
                                     select=select,
                                     expression=INDENT + e)
    return None
|
||
|
||
def movie_intro_question(x):
    """Build the SPARQL query that looks up a movie's introduction.

    The first word whose POS tag is "nz" is used as the movie's Chinese
    name.

    :param x: iterable of word objects exposing ``token`` and ``pos``.
    :return: the query string, or None when no such word is present.
    """
    select = u"?x0"

    for w in x:
        if w.pos == "nz":
            # Escape backslashes and single quotes so the token cannot end
            # the quoted literal early and corrupt the query.
            title = w.token.replace(u"\\", u"\\\\").replace(u"'", u"\\'")
            e = (u" ?a :movie_chName '{title}'. \n"
                 u" ?a :movie_bio ?x0").format(title=title)
            return SPARQL_TEM.format(preamble=SPARQL_PREAMBLE,
                                     select=select,
                                     expression=INDENT + e)
    return None
|
||
def customize_rules():
    """Build the question-matching rules.

    Each rule pairs a word pattern with the function that turns the matched
    words into a SPARQL query.

    :return: list of Rule objects, in the order they should be tried.
    """
    # some rules for matching
    # TODO: customize your own rules here
    person = (W(pos="nr") | W(pos="x") | W(pos="nrt") | W(pos="nz"))
    movie = (W(pos="nz"))
    place = (W("出生地") | W("出生"))
    # "介绍" is a word, so it belongs in the token pattern. It used to be
    # passed as `pos`, which compared it against the POS tag instead of the
    # word text.
    intro = (W("简介") | W("介绍"))

    rules = [
        # "<pronoun> 是 <person>" or "<person> 是 <pronoun>"
        Rule(condition=(W(pos="r") + W("是") + person
                        | person + W("是") + W(pos="r")),
             action=who_is_question),

        # <person> ... <birth-place keyword> ...
        Rule(condition=(person + Star(Any(), greedy=False)
                        + place + Star(Any(), greedy=False)),
             action=where_is_from_question),

        # <movie> ... <introduction keyword> ...
        Rule(condition=(movie + Star(Any(), greedy=False)
                        + intro + Star(Any(), greedy=False)),
             action=movie_intro_question),
    ]
    return rules
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
# encoding=utf-8 | ||
|
||
""" | ||
@author: SimmerChan | ||
@contact: [email protected] | ||
@file: word_tagging.py | ||
@time: 2017/12/20 15:31 | ||
@desc: 定义Word类的结构;定义Tagger类,实现自然语言转为Word对象的方法。 | ||
""" | ||
import jieba | ||
import jieba.posseg as pseg | ||
|
||
|
||
class Word(object):
    """A segmented token together with its part-of-speech tag."""

    def __init__(self, token, pos):
        """Store the word text (``token``) and its POS tag (``pos``)."""
        self.token = token
        self.pos = pos

    def __repr__(self):
        # Readable form for debugging; lists of Word objects are printed by
        # this module's own __main__ block.
        return "{}(token={!r}, pos={!r})".format(
            type(self).__name__, self.token, self.pos)
|
||
|
||
class Tagger:
    """Segments sentences with jieba and wraps the pieces as Word objects."""

    def __init__(self, dict_paths):
        """Load every user dictionary listed in ``dict_paths`` into jieba.

        :param dict_paths: iterable of dictionary file paths; may be empty.
        """
        # Load the external dictionaries.
        for p in dict_paths:
            jieba.load_userdict(p)

    def get_word_objects(self, sentence):
        """
        Get :class:Word(token, pos)

        :param sentence: text to segment and tag.
        :return: list of Word objects, one per (word, tag) pair from
            ``pseg.cut``.
        """
        # The former encode('utf-8') / bytes.decode round trip handed back
        # the same text, so the token is used as-is.
        return [Word(word, tag) for word, tag in pseg.cut(sentence)]
|
||
if __name__ == '__main__':
    # Manual check: tag each line typed on stdin and show the result.
    tagger = Tagger(['../data/actorName.txt', '../data/movieName.txt'])
    while True:
        s = input()
        # Segment once and reuse the result; the sentence used to be
        # segmented a second time just for the per-word printout.
        words = tagger.get_word_objects(s)
        print("tagger.get_word_objects(s): ", words)
        for i in words:
            print(i.token, i.pos)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
# Zero_knowledge_graph | ||
从零开始构建知识图谱 | ||
|
||
# 简介 | ||
为了构建中文百科类知识图谱,我们参考漆桂林老师团队做的[zhishi.me](http://zhishi.me/)。目标是包含百度百科、互动百科、中文wiki百科的知识,千万级实体数量和亿级别的关系数目。目前已完成百度百科和互动百科部分,其中百度百科词条4,190,390条,互动百科词条4,382,575条。转换为RDF格式得到三元组 128,596,018个。存入 neo4j中得到节点 16,498,370个,关系 56,371,456个,属性 61,967,517个。<br> |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
# Define here the models for your scraped items | ||
# | ||
# See documentation in: | ||
# https://doc.scrapy.org/en/latest/topics/items.html | ||
|
||
import scrapy | ||
|
||
|
||
class BaiduBaikeItem(scrapy.Item):
    """Scraped item declaring one Field per actor and per movie attribute."""

    # define the fields for your item here like:
    # name = scrapy.Field()

    # Actor
    # Actor-related attributes
    actor_id = scrapy.Field()
    actor_bio = scrapy.Field()
    actor_chName = scrapy.Field()
    actor_foreName = scrapy.Field()
    actor_nationality = scrapy.Field()
    actor_constellation = scrapy.Field()
    actor_birthPlace = scrapy.Field()
    actor_birthDay = scrapy.Field()
    actor_repWorks = scrapy.Field()
    actor_achiem = scrapy.Field()
    actor_brokerage = scrapy.Field()

    # movie
    # Movie-related attributes
    movie_id = scrapy.Field()
    movie_bio = scrapy.Field()
    movie_chName = scrapy.Field()
    movie_foreName = scrapy.Field()
    movie_prodTime = scrapy.Field()
    movie_prodCompany = scrapy.Field()
    movie_director = scrapy.Field()
    movie_screenwriter = scrapy.Field()
    movie_genre = scrapy.Field()
    movie_star = scrapy.Field()
    movie_length = scrapy.Field()
    # NOTE(review): "rekease" looks like a typo for "release"; the name is
    # kept unchanged because other code may use this key - confirm first.
    movie_rekeaseTime = scrapy.Field()
    movie_language = scrapy.Field()
    movie_achiem = scrapy.Field()
Oops, something went wrong.