Commit 6853ce9: LODlit changes
AndreiNesterov committed Sep 11, 2023
1 parent 2e6d4d6
Showing 18 changed files with 197 additions and 332 deletions.
9 changes: 1 addition & 8 deletions AAT/aat_bows.ipynb
@@ -137,18 +137,11 @@
 "with open('aat_bows_nl.json', 'w') as jf:\n",
 " json.dump(aat_bows_nl, jf)"
 ]
-},
-{
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
-}
 ],
 "metadata": {
 "kernelspec": {
- "display_name": "Python 3",
+ "display_name": "Python 3 (ipykernel)",
 "language": "python",
 "name": "python3"
 },
87 changes: 0 additions & 87 deletions LODlitParser/lodlitparser.py

This file was deleted.

3 changes: 0 additions & 3 deletions LODlitParser/requirements.txt

This file was deleted.

2 changes: 1 addition & 1 deletion ODWN/querying_odwn.ipynb
@@ -96,7 +96,7 @@
 ],
 "metadata": {
 "kernelspec": {
- "display_name": "Python 3",
+ "display_name": "Python 3 (ipykernel)",
 "language": "python",
 "name": "python3"
 },
Binary file added dist/LODlit-0.0.0-py3-none-any.whl
Binary file not shown.
Binary file added dist/LODlit-0.0.0.tar.gz
Binary file not shown.
20 changes: 20 additions & 0 deletions pyproject.toml
@@ -0,0 +1,20 @@
+[LODlit]
+dependencies = [
+    'nltk==3.8.1',
+    'pandas==2.0.3',
+    'numpy==1.21.0',
+    'requests==2.31.0',
+    'simplemma==0.9.1',
+    'spacy==3.6.1',
+    'SPARQLWrapper==2.0.0',
+    'lxml==4.9.3'
+]
+
+[metadata]
+name = "LODlit"
+version = "1.0.0"
+description = "Retrieving literal values from LOD"
+author = "Andrei Nesterov"
+author_email = "[email protected]"
+license = "CC BY 4.0"
+keywords = ["LOD", "literals", "linked open data", "strings", "NLP"]
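Note that setuptools reads project metadata from the [project] table defined in PEP 621; the [LODlit] and [metadata] tables above are not standard pyproject tables, which would explain why the generated PKG-INFO below records Version: 0.0.0 rather than 1.0.0. For comparison, a PEP 621-style sketch of the same metadata (not part of this commit) could look like:

[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"

[project]
name = "LODlit"
version = "1.0.0"
description = "Retrieving literal values from LOD"
authors = [{name = "Andrei Nesterov", email = "[email protected]"}]
license = {text = "CC BY 4.0"}
keywords = ["LOD", "literals", "linked open data", "strings", "NLP"]
dependencies = ["nltk==3.8.1", "pandas==2.0.3", "numpy==1.21.0", "requests==2.31.0", "simplemma==0.9.1", "spacy==3.6.1", "SPARQLWrapper==2.0.0", "lxml==4.9.3"]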
4 changes: 4 additions & 0 deletions src/LODlit.egg-info/PKG-INFO
@@ -0,0 +1,4 @@
+Metadata-Version: 2.1
+Name: LODlit
+Version: 0.0.0
+License-File: LICENSE
12 changes: 12 additions & 0 deletions src/LODlit.egg-info/SOURCES.txt
@@ -0,0 +1,12 @@
+LICENSE
+README.md
+pyproject.toml
+src/LODlit/aat.py
+src/LODlit/bows.py
+src/LODlit/odwn.py
+src/LODlit/pwn31.py
+src/LODlit/wd.py
+src/LODlit.egg-info/PKG-INFO
+src/LODlit.egg-info/SOURCES.txt
+src/LODlit.egg-info/dependency_links.txt
+src/LODlit.egg-info/top_level.txt
1 change: 1 addition & 0 deletions src/LODlit.egg-info/dependency_links.txt
@@ -0,0 +1 @@
+
1 change: 1 addition & 0 deletions src/LODlit.egg-info/top_level.txt
@@ -0,0 +1 @@
+LODlit
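Together with pyproject.toml and the dist/ artifacts, these egg-info files show the repository moving from the loose LODlitParser/ scripts to an installable LODlit package. A hedged usage sketch (assuming the built wheel installs cleanly, e.g. with pip install dist/LODlit-0.0.0-py3-none-any.whl):

# sketch: importing the packaged module instead of the old LODlitParser scripts
from LODlit import aat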
133 changes: 49 additions & 84 deletions LODlitParser/aat.py → src/LODlit/aat.py
@@ -1,6 +1,5 @@
 # Getty AAT
-# A module to parse query results (json) from the Getty Art & Architecture Thesaurus
-# to use 'get_bows', download stopwords from nltk: nltk.download('stopwords'); install simplemma
+# A module to parse query results (json) from The Getty Art & Architecture Thesaurus (AAT)
 
 import json
 import gzip
@@ -63,13 +62,16 @@ def sparql(aat_uri:list, lang:str) -> dict:
     sparql.setReturnFormat(JSON)
     results = sparql.query().convert()
 
+    prefLabel = None
     altLabels = []
     scopeNote = None
     prefLabel_comment = None
     altLabel_comment = None
 
     for result in results['results']['bindings']:
 
+        if 'prefLabel' in result:
+            prefLabel = result['prefLabel']['value']
         if 'altLabels' in result:
            altLabels = result['altLabels']['value'].split('#')
         if 'scopeNote' in result:
@@ -80,7 +82,7 @@ def sparql(aat_uri:list, lang:str) -> dict:
            altLabel_comment = result['altLabel_comment']['value']
 
     result_dict[uri]['lang'] = lang
-    result_dict[uri]['prefLabel'] = result['prefLabel']['value']
+    result_dict[uri]['prefLabel'] = prefLabel
     result_dict[uri]['altLabels'] = altLabels
     result_dict[uri]['prefLabel_comment'] = prefLabel_comment
     result_dict[uri]['altLabel_comment'] = altLabel_comment
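This hunk fixes a latent KeyError: prefLabel is now initialised to None and only overwritten when a SPARQL binding actually contains one. A hedged call sketch (the AAT URI is illustrative, not taken from this commit):

# sketch: querying one AAT concept; returns {uri: {lang, prefLabel, altLabels, ...}}
labels = sparql(["300404626"], "en")
# concepts without an English prefLabel now yield prefLabel=None instead of raising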
@@ -156,9 +158,11 @@ def find_term_in_literals(query_term:str, lang:str) -> list:
     '''
 
     # reading the gzip json file with aat search results
-    # change the path to GitHub
-    with gzip.open(f"/Users/anesterov/reps/LODlit/AAT/gzip_aat_subgraph_{lang}.json", 'r') as gzip_json:
-        aat_gzip = json.loads(gzip_json.read().decode('utf-8'))
+    # path to raw gzip on GitHub
+    gzip_path = f"https://github.com/cultural-ai/LODlit/raw/main/AAT/gzip_aat_subgraph_{lang}.json"
+
+    # decompressing
+    aat_gzip = json.loads(gzip.decompress(requests.get(gzip_path).content))
 
     list_of_results = []
 
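The key change here is swapping gzip.open(), which needs a local file, for gzip.decompress(), which works on the raw bytes that requests returns. A standalone sketch of the pattern (English subgraph as the example):

import gzip
import json
import requests

url = "https://github.com/cultural-ai/LODlit/raw/main/AAT/gzip_aat_subgraph_en.json"
raw_bytes = requests.get(url).content          # gzip-compressed payload
data = json.loads(gzip.decompress(raw_bytes))  # decompress in memory, then parse JSON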
@@ -220,19 +224,18 @@ def find_term_in_literals(query_term:str, lang:str) -> list:
 
     return list_of_results
 
-def get_bows(path_to_results:str, lang:str) -> dict:
+def get_bows(lang:str) -> dict:
     '''
     Getting bag of words (BoW) from the AAT search results for every search term
-    path_to_results: str, a path to the search results (in json format)
     lang: str, 'en' or 'nl'
     Returns a dict with BoWs per hit per term: {term:[{aat_URI:['token1','token2','token3']}]}
     '''
 
+    path_to_results = f"https://github.com/cultural-ai/LODlit/raw/main/AAT/aat_query_results_{lang}.json"
+    search_results = requests.get(path_to_results).json()
+
     wnl = WordNetLemmatizer()
     all_bows = {}
 
-    with open(path_to_results,'r') as jf:
-        search_results = json.load(jf)
-
     for query_term, results in search_results.items():
 
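get_bows() now derives the results URL from lang instead of taking a local path. A hedged usage sketch (assuming the required nltk data, e.g. stopwords and wordnet, is already downloaded):

# sketch: BoWs per hit per English query term, fetched straight from the repo
bows_en = get_bows("en")
# shape per the docstring: {term: [{aat_URI: ['token1', 'token2', ...]}, ...], ...}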
@@ -274,61 +277,32 @@ def get_lit_related_matches_bow(lang:str) -> dict:
     path_rm = "https://github.com/cultural-ai/wordsmatter/raw/main/related_matches/rm.json"
     rm = requests.get(path_rm).json()
 
-    # checking lang
-    if lang == "en":
-        # change path
-        with open('/Users/anesterov/reps/LODlit/AAT/aat_bows_en.json','r') as jf:
-            aat_bows = json.load(jf)
-
-        # getting a list of all AAT URIs of related matches
-        # terms with no related matches won't be included in the file
-        related_matches_aat = list(set([values["related_matches"]["aat"][0] for values in rm.values() \
-            if values["lang"] == "en" and values["related_matches"]["aat"][0] != 'None']))
-
-        # getting BoWs for AAT concept URIs
-        related_matches_aat_uri_bows = {}
-        for uri_rm in related_matches_aat:
-            for hits in aat_bows.values():
-                for hit in hits:
-                    for uri, bow in hit.items():
-                        if uri == uri_rm:
-                            related_matches_aat_uri_bows[uri_rm] = bow
-
-        # shaping resulting dict: terms with related matches and BoWs
-        for values in rm.values():
-            if values["lang"] == "en":
-                rm_aat = values["related_matches"]["aat"][0]
-                if rm_aat != "None":
-                    for term in values["query_terms"]:
-                        if rm_aat in related_matches_aat_uri_bows.keys():
-                            results[term] = {"aat_uri":rm_aat,"bow":related_matches_aat_uri_bows[rm_aat]}
-
-    if lang == "nl":
-        # change path
-        with open('/Users/anesterov/reps/LODlit/AAT/aat_bows_nl.json','r') as jf:
-            aat_bows = json.load(jf)
-
-        # getting a list of all AAT URIs of related matches
-        related_matches_aat = list(set([values["related_matches"]["aat"][0] for values in rm.values() \
-            if values["lang"] == "nl" and values["related_matches"]["aat"][0] != 'None']))
-
-        # getting BoWs for AAT concept URIs
-        related_matches_aat_uri_bows = {}
-        for uri_rm in related_matches_aat:
-            for hits in aat_bows.values():
-                for hit in hits:
-                    for uri, bow in hit.items():
-                        if uri == uri_rm:
-                            related_matches_aat_uri_bows[uri_rm] = bow
-
-        # shaping resulting dict: terms with related matches and BoWs
-        for values in rm.values():
-            if values["lang"] == "nl":
-                rm_aat = values["related_matches"]["aat"][0]
-                if rm_aat != "None":
-                    for term in values["query_terms"]:
-                        if rm_aat in related_matches_aat_uri_bows.keys():
-                            results[term] = {"aat_uri":rm_aat,"bow":related_matches_aat_uri_bows[rm_aat]}
+    # load aat bows
+    path_to_bows = f"https://github.com/cultural-ai/LODlit/raw/main/AAT/aat_bows_{lang}.json"
+    aat_bows = requests.get(path_to_bows).json()
+
+    # getting a list of all AAT URIs of related matches
+    # terms with no related matches won't be included in the file
+    related_matches_aat = list(set([values["related_matches"]["aat"][0] for values in rm.values() \
+        if values["lang"] == lang and values["related_matches"]["aat"][0] != 'None']))
+
+    # getting BoWs for AAT concept URIs
+    related_matches_aat_uri_bows = {}
+    for uri_rm in related_matches_aat:
+        for hits in aat_bows.values():
+            for hit in hits:
+                for uri, bow in hit.items():
+                    if uri == uri_rm:
+                        related_matches_aat_uri_bows[uri_rm] = bow
+
+    # shaping resulting dict: terms with related matches and BoWs
+    for values in rm.values():
+        if values["lang"] == lang:
+            rm_aat = values["related_matches"]["aat"][0]
+            if rm_aat != "None":
+                for term in values["query_terms"]:
+                    if rm_aat in related_matches_aat_uri_bows.keys():
+                        results[term] = {"aat_uri":rm_aat,"bow":related_matches_aat_uri_bows[rm_aat]}
 
     return results
 
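The duplicated en/nl branches collapse into one lang-parameterised path with no behaviour change. A hedged usage sketch (the return shape follows the code above; the URI string is illustrative):

# sketch: BoWs of AAT related matches for Dutch query terms
rm_bows_nl = get_lit_related_matches_bow("nl")
# e.g. {"term": {"aat_uri": "300189559", "bow": ["token1", "token2", ...]}, ...}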
@@ -342,24 +316,18 @@ def get_cs(lang:str):
     query_term, aat_URI, bow, cs_rm, cs_wm, cs_rm_wm
     '''
 
-    nlp = bows._load_spacy_nlp(lang)
+    nlp = bows.load_spacy_nlp(lang)
 
     # load background info
-    # change path
-    with open('/Users/anesterov/reps/LODlit/bg/background_info_bows.json','r') as jf:
-        bg_info = json.load(jf)
-
-    aat_df = pd.DataFrame(columns=['term','hit_id','bow','cs_rm','cs_wm','cs_rm_wm'])
+    path_bg = "https://github.com/cultural-ai/LODlit/raw/main/bg/background_info_bows.json"
+    bg_info = requests.get(path_bg).json()
 
-    # check lang and load appropriate file
+    # load aat bows
 
-    if lang == "en":
-        with open('/Users/anesterov/reps/LODlit/AAT/aat_bows_en.json','r') as jf:
-            aat_bows = json.load(jf)
+    path_aat_bows = f"https://github.com/cultural-ai/LODlit/raw/main/AAT/aat_bows_{lang}.json"
+    aat_bows = requests.get(path_aat_bows).json()
 
-    if lang == "nl":
-        with open('/Users/anesterov/reps/LODlit/AAT/aat_bows_nl.json','r') as jf:
-            aat_bows = json.load(jf)
+    aat_df = pd.DataFrame(columns=['term','hit_id','bow','cs_rm','cs_wm','cs_rm_wm'])
 
     for term, hits in aat_bows.items():
 
@@ -390,7 +358,4 @@ def get_cs(lang:str):
         else:
             aat_df.loc[len(aat_df)] = [term,None,None,None,None,None]
 
-    return aat_df
-
-
-
+    return aat_df
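A hedged usage sketch for the refactored get_cs() (assuming the spaCy model loaded by bows.load_spacy_nlp() is installed for the chosen language):

# sketch: cosine-similarity scores for every English AAT hit
aat_cs = get_cs("en")
aat_cs.head()  # columns: term, hit_id, bow, cs_rm, cs_wm, cs_rm_wm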