Skip to content

Commit

Permalink
all
Browse files Browse the repository at this point in the history
  • Loading branch information
myhhub committed Oct 17, 2019
0 parents commit 233d9ec
Show file tree
Hide file tree
Showing 132 changed files with 7,807 additions and 0 deletions.
18 changes: 18 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
KBQA/actorName.txt
KBQA/movieName.txt
*.json
*.pyc
semantic_search/elasticsearch/data/*.txt
semantic_search/elasticsearch/data/attr_ac.pkl
ie/deepdive/udf/bazaar/*
ie/deepdive/*.txt
ie/deepdive/run/*
ie/deepdive/*.csv
*.txt
*.csv
ie/craw/craw_all_baidu/craws/*
ie/re_cnn_att/data/*.csv
ie/re_cnn_att/data/*.txt
ie/re_cnn_att/data/*.pkl
ie/re_cnn_att/data/*.json
ie/re_cnn_att/thirdpart/*
16 changes: 16 additions & 0 deletions KBQA/patternREfO/get_dict.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/bin/bash
#
# Export actor/movie name dictionaries from the hudong and baidu DBs
# and tag them for use as jieba user dictionaries.
# NOTE(review): change the MySQL user and password below for your own DB.
mysql -uroot -pnlp < ./data/get_dict.txt

# MySQL can only write SELECT ... INTO OUTFILE results under
# secure_file_priv; copy the exported files into the working directory.
sudo cp /var/lib/mysql-files/*Name.txt .

# Merge the per-source name lists and drop duplicates.
cat baidu_actorName.txt hudong_actorName.txt | sort -u > actorTmp.txt
cat baidu_movieName.txt hudong_movieName.txt | sort -u > movieTmp.txt
# Append jieba POS tags: "nr" (person name) for actors, "nz" (proper noun)
# for movies.
awk '{print $0 " nr"}' actorTmp.txt > actorName.txt
awk '{print $0 " nz"}' movieTmp.txt > movieName.txt

# Remove intermediate files.
# BUG FIX: rm expands shell globs, not regexes -- the original
# `rm ^[am].*Name.txt` matched no existing file and always failed.
# Delete the merge temporaries and the raw per-source exports, keeping
# the final actorName.txt / movieName.txt outputs.
rm -f ./*Tmp.txt baidu_*Name.txt hudong_*Name.txt
31 changes: 31 additions & 0 deletions KBQA/patternREfO/query.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#!/usr/bin/env python
# coding=utf-8

from SPARQLWrapper import SPARQLWrapper, JSON
from utils.word_tagging import Tagger
from utils.rules import customize_rules

if __name__ == "__main__":
    # One-time setup: SPARQL endpoint, dictionary-backed tagger, REfO rules.
    print("init...........")
    endpoint = SPARQLWrapper("http://localhost:3030/kg_movie/query")
    word_tagger = Tagger(['data/actorName.txt', 'data/movieName.txt'])
    question_rules = customize_rules()
    print("done \n")

    # Interactive loop: tag the question, try each rule, run any query built.
    while True:
        print("Please input your question: ")
        question = input()
        tagged_words = word_tagger.get_word_objects(question)

        for rule in question_rules:
            sparql_query = rule.apply(tagged_words)
            if not sparql_query:
                continue
            endpoint.setQuery(sparql_query)
            endpoint.setReturnFormat(JSON)
            answer = endpoint.query().convert()

            bindings = answer["results"]["bindings"]
            if not bindings:
                print("No answer found :(")
                continue
            for binding in bindings:
                print("Result: ", binding["x0"]["value"])
Empty file.
115 changes: 115 additions & 0 deletions KBQA/patternREfO/utils/rules.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
#!/usr/bin/env python
# coding=utf-8

import re
from refo import finditer, Predicate, Star, Any

# SPARQL config
SPARQL_PREAMBLE = u"""
PREFIX : <http://www.kgdemo.com#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
"""

SPARQL_TEM = u"{preamble}\n" + \
u"SELECT DISTINCT {select} WHERE {{\n" + \
u"{expression}\n" + \
u"}}\n"

INDENT = " "

class W(Predicate):
    """REfO predicate matching one Word by token and/or POS regex.

    Both patterns get a "$" appended and are applied with ``re.match``,
    so each must match its string from start to end.
    """

    def __init__(self, token=".*", pos=".*"):
        # Compile the anchored patterns once per predicate instance.
        self.token = re.compile(token + "$")
        self.pos = re.compile(pos + "$")
        super(W, self).__init__(self.match)

    def match(self, word):
        """Return truthy only when both token and POS regexes match."""
        return (self.token.match(word.token)
                and self.pos.match(word.pos))

class Rule(object):
    """Pair a REfO pattern (condition) with an action callback.

    When the condition matches a span of the tagged sentence, the covered
    Word objects are passed to the action, which renders them into a
    SPARQL query string (or returns None when it cannot).
    """

    def __init__(self, condition=None, action=None):
        assert condition and action
        self.condition = condition
        self.action = action

    def apply(self, sentence):
        """Run the action on all Words covered by condition matches.

        *sentence* is a sequence of Word objects; every match span found
        by refo.finditer contributes its words to the action input.
        """
        matches = []
        for m in finditer(self.condition, sentence):
            i, j = m.span()
            matches.extend(sentence[i:j])
        # BUG FIX: removed a stray dead `if __name__ == '__main__': pass`
        # that had been pasted into the middle of this method.
        return self.action(matches)

def who_is_question(x):
    """Build a SPARQL query for an actor's biography (":actor_bio").

    Scans the matched words for the first person-like token -- POS "nr"
    (person name) or "x" (custom-dictionary entry) -- and returns the
    rendered query, or None when no such word is present.
    """
    for word in x:
        if word.pos not in ("nr", "x"):
            continue
        e = (u" ?a :actor_chName '{person}'. \n "
             u"?a :actor_bio ?x0").format(person=word.token)
        return SPARQL_TEM.format(preamble=SPARQL_PREAMBLE,
                                 select=u"?x0",
                                 expression=INDENT + e)
    return None

def where_is_from_question(x):
    """Build a SPARQL query for an actor's birth place (":actor_birthPlace").

    Uses the first person-like word (POS "nr", "x", or "nrt") among the
    matched words; returns the rendered query or None if none is found.
    """
    for word in x:
        if word.pos not in ("nr", "x", "nrt"):
            continue
        e = (u" ?a :actor_chName '{person}'.\n "
             u"?a :actor_birthPlace ?x0").format(person=word.token)
        return SPARQL_TEM.format(preamble=SPARQL_PREAMBLE,
                                 select=u"?x0",
                                 expression=INDENT + e)
    return None


def movie_intro_question(x):
    """Build a SPARQL query for a movie's synopsis (":movie_bio").

    Uses the first word tagged "nz" (proper noun, the movie title) among
    the matched words; returns the rendered query or None if none is found.
    """
    for word in x:
        if word.pos != "nz":
            continue
        e = (u" ?a :movie_chName '{person}'. \n "
             u"?a :movie_bio ?x0").format(person=word.token)
        return SPARQL_TEM.format(preamble=SPARQL_PREAMBLE,
                                 select=u"?x0",
                                 expression=INDENT + e)
    return None

def customize_rules():
    """Return the REfO rules that map question patterns to query builders.

    Each Rule pairs a word-level pattern with a handler that renders the
    matched words into a SPARQL query.
    """
    # Word classes used by the patterns below.
    person = (W(pos="nr") | W(pos="x") | W(pos="nrt") | W(pos="nz"))
    movie = (W(pos="nz"))
    place = (W("出生地") | W("出生"))
    # BUG FIX: "介绍" is a word token, not a jieba POS tag -- the original
    # W(pos="介绍") alternative could never match any tagged word.
    intro = (W("简介") | W("介绍"))

    rules = [
        # "<pronoun> 是 <person>" or "<person> 是 <pronoun>" -> biography.
        Rule(condition=W(pos="r") + W("是") + person |
                       person + W("是") + W(pos="r"),
             action=who_is_question),

        # <person> ... birth-place keyword ... -> birth place.
        Rule(condition=person + Star(Any(), greedy=False) + place + Star(Any(), greedy=False),
             action=where_is_from_question),

        # <movie> ... intro keyword ... -> synopsis.
        Rule(condition=movie + Star(Any(), greedy=False) + intro + Star(Any(), greedy=False),
             action=movie_intro_question),
    ]
    return rules
44 changes: 44 additions & 0 deletions KBQA/patternREfO/utils/word_tagging.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# encoding=utf-8

"""
@author: SimmerChan
@contact: [email protected]
@file: word_tagging.py
@time: 2017/12/20 15:31
@desc: 定义Word类的结构;定义Tagger类,实现自然语言转为Word对象的方法。
"""
import jieba
import jieba.posseg as pseg


class Word(object):
    """One segmented token with its jieba part-of-speech tag."""

    def __init__(self, token, pos):
        # token: the word text; pos: jieba POS tag (e.g. "nr", "nz").
        self.token = token
        self.pos = pos


class Tagger:
    """Segment and POS-tag sentences with jieba, using extra user dicts."""

    def __init__(self, dict_paths):
        # Load the external actor/movie name dictionaries so jieba treats
        # them as single tokens carrying our custom POS tags.
        for p in dict_paths:
            jieba.load_userdict(p)

    def get_word_objects(self, sentence):
        """Return *sentence* as a list of :class:`Word` (token, pos) objects.

        FIX: dropped the redundant ``bytes.decode(word.encode('utf-8'))``
        round-trip. jieba already yields text; on Python 3 the round-trip
        was an identity, and on Python 2 the implicit ASCII decode raised
        UnicodeDecodeError for any Chinese text. (Also fixed the "WOrd"
        docstring typo.)
        """
        return [Word(word, tag) for word, tag in pseg.cut(sentence)]

if __name__ == '__main__':
    # Manual smoke test: tag stdin lines with the name dictionaries loaded.
    tagger = Tagger(['../data/actorName.txt', '../data/movieName.txt'])
    while True:
        s = input()
        print("tagger.get_word_objects(s): ", tagger.get_word_objects(s))
        for word in tagger.get_word_objects(s):
            print(word.token, word.pos)
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Zero_knowledge_graph
从零开始构建知识图谱

# 简介
为了构建中文百科类知识图谱,我们参考漆桂林老师团队做的[zhishi.me](http://zhishi.me/)。目标是包含百度百科、互动百科、中文wiki百科的知识,千万级实体数量和亿级别的关系数目。目前已完成百度百科和互动百科部分,其中百度百科词条4,190,390条,互动百科词条4,382,575条。转换为RDF格式得到三元组 128,596,018个。存入 neo4j中得到节点 16,498,370个,关系 56,371,456个,属性 61,967,517个。<br>
Empty file.
43 changes: 43 additions & 0 deletions ie/craw/baidu_baike/baidu_baike/items.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class BaiduBaikeItem(scrapy.Item):
    """Scrapy item holding one Baidu Baike actor or movie record.

    A single crawled page fills either the actor_* or the movie_* fields;
    field names double as the downstream storage schema, so they must not
    be renamed without updating every pipeline/spider that uses them.
    """
    # define the fields for your item here like:
    # name = scrapy.Field()
    # Actor
    # Actor-related attributes.
    actor_id = scrapy.Field()             # primary key assigned by the crawler
    actor_bio = scrapy.Field()            # biography text
    actor_chName = scrapy.Field()         # Chinese name
    actor_foreName = scrapy.Field()       # foreign-language name
    actor_nationality = scrapy.Field()
    actor_constellation = scrapy.Field()
    actor_birthPlace = scrapy.Field()
    actor_birthDay = scrapy.Field()
    actor_repWorks = scrapy.Field()       # representative works
    actor_achiem = scrapy.Field()         # achievements
    actor_brokerage = scrapy.Field()      # talent agency

    # movie
    # Movie-related attributes.
    movie_id = scrapy.Field()             # primary key assigned by the crawler
    movie_bio = scrapy.Field()            # synopsis text
    movie_chName = scrapy.Field()         # Chinese title
    movie_foreName = scrapy.Field()       # foreign-language title
    movie_prodTime = scrapy.Field()       # production time
    movie_prodCompany = scrapy.Field()    # production company
    movie_director = scrapy.Field()
    movie_screenwriter = scrapy.Field()
    movie_genre = scrapy.Field()
    movie_star = scrapy.Field()           # starring cast
    movie_length = scrapy.Field()
    # NOTE(review): "rekeaseTime" is a typo for "releaseTime", but the name
    # is part of the stored schema and is kept for compatibility with
    # existing pipelines and data.
    movie_rekeaseTime = scrapy.Field()    # release time
    movie_language = scrapy.Field()
    movie_achiem = scrapy.Field()         # awards / achievements
Loading

0 comments on commit 233d9ec

Please sign in to comment.