-
Notifications
You must be signed in to change notification settings - Fork 0
/
0.py
63 lines (58 loc) · 2.58 KB
/
0.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# This script measures the textual similarity between two DCAT (Data Catalog
# Vocabulary) dataset descriptions using TF-IDF vectors and cosine similarity.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Define the texts to compare.
# text1 is the first document: a DCAT-style description of ex:dataset-001
# ("Machine parameters") written in Turtle-like notation. It is stored as a
# raw multi-line string and is only ever handed to the TF-IDF vectorizer as
# plain text; nothing in this script parses it as RDF.
text1 = '''
ex:dataset-001
a dcat:Dataset ;
dcterms:title "Machine parameters"@en ;
dcterms:title "Parametros de la maquina"@es ;
dcat:keyword "accountability"@en, "transparency"@en;
dcat:keyword "responsabilidad"@es, "transparencia"@es;
dcterms:creator ex:alejandro-espert ;
dcterms:issued "2022-10-05"^^xsd:date ;
dcterms:modified "2022-10-15"^^xsd:date ;
dcat:contactPoint <https://www.teaming-ai.eu/somebody/contact> ;
dcterms:temporal [ a dcterms:PeriodOfTime ;
dcat:startDate "2022-07-01"^^xsd:date ;
dcat:endDate "2022-09-30"^^xsd:date ;
];
dcat:temporalResolution "P1D"^^xsd:duration ;
dcterms:spatial <http://sws.geonames.org/6695072/> ;
dcat:spatialResolutionInMeters "1.0"^^xsd:decimal ;
dcterms:publisher ex:industrias-alegre ;
dcterms:language <http://id.loc.gov/vocabulary/iso639-1/en> ;
dcterms:accrualPeriodicity <http://purl.org/linked-data/sdmx/2009/code#freq-W> ;
dcat:distribution ex:dataset-001-csv ;
dcat:distribution ex:dataset-001-ttl ;
'''
# text2 is the second document: a DCAT-style description of ex:dataset-002
# ("Imaginary dataset") in the same Turtle-like notation. It lists the same
# properties as text1 with different values (titles, creator, dates, contact
# point, publisher, spatial resolution) plus one extra keyword per language
# ("payments" / "pagos"). Like text1, it is compared purely as plain text.
text2 = '''
ex:dataset-002
a dcat:Dataset ;
dcterms:title "Imaginary dataset"@en ;
dcterms:title "Conjunto de datos imaginario"@es ;
dcat:keyword "accountability"@en, "transparency"@en, "payments"@en ;
dcat:keyword "responsabilidad"@es, "transparencia"@es, "pagos"@es ;
dcterms:creator ex:finance-employee-001 ;
dcterms:issued "2011-12-05"^^xsd:date ;
dcterms:modified "2011-12-15"^^xsd:date ;
dcat:contactPoint <http://dcat.example.org/transparency-office/contact> ;
dcterms:temporal [ a dcterms:PeriodOfTime ;
dcat:startDate "2011-07-01"^^xsd:date ;
dcat:endDate "2011-09-30"^^xsd:date ;
];
dcat:temporalResolution "P1D"^^xsd:duration ;
dcterms:spatial <http://sws.geonames.org/6695072/> ;
dcat:spatialResolutionInMeters "30.0"^^xsd:decimal ;
dcterms:publisher ex:finance-ministry ;
dcterms:language <http://id.loc.gov/vocabulary/iso639-1/en> ;
dcterms:accrualPeriodicity <http://purl.org/linked-data/sdmx/2009/code#freq-W> ;
dcat:distribution ex:dataset-002-csv ;
dcat:distribution ex:dataset-002-ttl ;
'''
# Fit a single TF-IDF model over both texts so that they share one vocabulary
# and their vectors are directly comparable; the result has one row per text.
documents = [text1, text2]
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

# Compare row 0 (text1) against row 1 (text2). cosine_similarity returns a
# 2-D array, so the single score is read from position [0][0].
first_vector = tfidf_matrix[0]
second_vector = tfidf_matrix[1]
similarity_matrix = cosine_similarity(first_vector, second_vector)
cosine_sim = similarity_matrix[0][0]

# Report the score on stdout.
print(f"The cosine similarity between the two texts is: {cosine_sim}")