diff --git a/.gitignore b/.gitignore index cf685f0..34c116d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,6 @@ ## fichiers provisoirement invisibles /notebooks_jupyter/dbpedia_exploration/dbp_merge_relations_interactif.ipynb -/notebooks_jupyter/wikidata_exploration/wikidata_get_data_from_dbpedia.ipynb /notebooks_jupyter/wikidata_exploration/wikidata_get_data_from_dbpedia_rdflib.ipynb # CSV non versionnés, sauf exceptions diff --git a/Wikidata/codage_occupations.sql b/Wikidata/codage_occupations.sql index 8ecc334..4e2b66a 100644 --- a/Wikidata/codage_occupations.sql +++ b/Wikidata/codage_occupations.sql @@ -132,19 +132,6 @@ ORDER BY eff DESC LIMIT 100; -WTIH tw1 AS ( -SELECT DISTINCT wp.personUri, wp.personLabel, od.label -FROM wdt_person_occupation po - JOIN wdt_occupation wo ON po.occupationUri = wo.wdt_uri - JOIN wdt_personne wp ON wp.personUri = po.personUri - LEFT JOIN occupation_domain od ON od.pk_occupation_domain = wo.fk_domain - ORDER BY wp.personUri, od.label) -SELECT wp.personUri, wp.personLabel, count(*), group_concat(od.label) -FROM tw1 -GROUP BY wp.personUri, wp.personLabel; - - - -- regrouper par effectifs de domaines WITH tw1 AS ( diff --git a/notebooks_jupyter/wikidata_exploration/wikidata_get_data_from_dbpedia.ipynb b/notebooks_jupyter/wikidata_exploration/wikidata_get_data_from_dbpedia.ipynb new file mode 100644 index 0000000..a114958 --- /dev/null +++ b/notebooks_jupyter/wikidata_exploration/wikidata_get_data_from_dbpedia.ipynb @@ -0,0 +1,1577 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "99893f8e-ad17-4d36-b2a2-945ed55f567f", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "d30b11d8-3b99-4b9c-ab3c-67048bf27de9", + "metadata": {}, + "source": [ + "## Note générale\n", + "\n", + "* DBpedia chapters dans différentes langues\n", + "* Propriétés etc." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "5f6d1a66-be62-4939-886d-4667a08852bc", + "metadata": {}, + "outputs": [], + "source": [ + "### Librairies à installer dans l'environnement conda\n", + "# qui sera choisi pour exécuter le carnet\n", + "from SPARQLWrapper import SPARQLWrapper, SPARQLWrapper2, JSON, TURTLE, XML, RDFXML\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "0784fb0d-85e4-418a-9ee5-55935428e71b", + "metadata": { + "metadata": {} + }, + "outputs": [], + "source": [ + "### Librairies système\n", + "from importlib import reload\n", + "import sqlite3 as sql\n", + "import datetime\n", + "import sys" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "50937e91-7316-4431-b360-ae4ff7117d44", + "metadata": { + "metadata": {} + }, + "outputs": [], + "source": [ + "### Importer un module de fonctions crées ad hoc\n", + "# Le module 'sparql_functions.py' n'est rien d'autre qu'un fichier \n", + "# qui contient des fonctions qui seront réutilisées souvent\n", + "## ATTENTION : le fichier 'sparql_functions.py' doit se trouver \n", + "# dans un dossier qui se situe dans le chemin ('path') de recherche\n", + "#  qui est vu par le présent carnet Jupyter afin que l'importation \n", + "# fonctionne correctement\n", + "\n", + "# Add parent directory to the path\n", + "sys.path.insert(0, '..')\n", + "\n", + "### If you want to add the parent-parent directory,\n", + "sys.path.insert(0, '../..')\n", + "\n", + "\n", + "import sparql_functions as spqf\n", + "\n", + "## si on modifie fichier, il faut le recharger\n", + "# ce n'est généralement pas nécessaire donc commenté\n", + "\n", + "# reload(spqf)" + ] + }, + { + "cell_type": "markdown", + "id": "a660e615", + "metadata": {}, + "source": [ + "## Créer une table dans la base de données pour stocker les URI\n", + "\n", + "Dans cette table on va mettre les URI se référant 
aux mêmes individus du monde, les mêmes _ressources_ selon l'expression technique, auxquels correspondent les URI de différents systèmes d'information.\n", + "\n", + "\n", + "Ici il s'agit des URI de Wikidata et de DBpedia, éventuellement dans les différentes versions linguistiques de DBpedia.\n", + "\n", + "\n", + "En général il vaut mieux toujours mettre l'URI d'un système en premier, par ex. Wikidata, et DBpedia ensuite.\n", + "\n", + "\n", + "On peut ainsi profiter de l'index _UNIQUE_ qui fait que si on réinsère une paire d'URI déjà existante, dans le même ordre, l'insertion sera ignorée et on évitera de produire des lignes en trop. " + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "511744d3-2a07-450a-b261-dd25aab67f18", + "metadata": { + "metadata": {} + }, + "outputs": [], + "source": [ + "### Requête SQL qui crée la table dans la BD\n", + "\n", + "create_table_uri = \"\"\"\n", + "CREATE TABLE same_as_uri (\n", + " pk_same_as_uri INTEGER PRIMARY KEY AUTOINCREMENT,\n", + " subject TEXT,\n", + " object TEXT,\n", + " notes TEXT,\n", + " UNIQUE (\n", + " subject,\n", + " object\n", + " )\n", + " ON CONFLICT IGNORE);\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "26563d2c-820b-40f1-95d2-95614c854e05", + "metadata": { + "metadata": {} + }, + "outputs": [], + "source": [ + "### Requête qui écrase toute la table si on veut la refaire\n", + "# ATTENTION : cette mesure est irréversible\n", + "drop_table_uri = \"\"\"\n", + "DROP TABLE same_as_uri;\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "3f05c4e9-dc7e-4f8e-8741-40543f98fa83", + "metadata": {}, + "outputs": [], + "source": [ + "create_table_statement = \"\"\"\n", + "CREATE TABLE IF NOT EXISTS statement (\n", + " pk_statement INTEGER PRIMARY KEY AUTOINCREMENT,\n", + " subject TEXT,\n", + " property TEXT,\n", + " object TEXT,\n", + " notes TEXT,\n", + " import_notes TEXT,\n", + " -- la prochaine ligne seulement si la table 
existe\n", + " fk_sparql_query INTEGER REFERENCES sparql_query (pk_sparql_query) MATCH SIMPLE,\n", + " UNIQUE (\n", + " subject,\n", + " property,\n", + " object\n", + " )\n", + " ON CONFLICT IGNORE);\n", + " \"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "b9aa5fc8-6923-4ada-adf4-777c611bbf8a", + "metadata": { + "metadata": {} + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "### Se connecter à la base de données dans laquelle on va insérer\n", + "# le résultat de la requête SPARQL\n", + "cn = sql.connect('../../data/astronomers_import.db')\n", + "cn" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "6797e125-47d9-449a-9577-ddf2a41c7721", + "metadata": { + "metadata": {} + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Error: table same_as_uri already exists\n" + ] + } + ], + "source": [ + "### Exécuter une requête SQL\n", + "cur = cn.cursor()\n", + "try:\n", + " l = cur.execute(create_table_uri)\n", + "except Exception as e:\n", + " print('Error:', e)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 108, + "id": "1a5d86df-38f0-4d0b-8b28-8ee9142d6ce6", + "metadata": { + "metadata": {} + }, + "outputs": [], + "source": [ + "### Cette instruction rend la modification persistante\n", + "# ATTENTION : la mesure est irrévocable, donc commenté par prudence\n", + "# Décommenter pour exécuter\n", + "\n", + "# cn.commit()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "13d45586-4cb7-4a15-b881-df556711e776", + "metadata": {}, + "outputs": [], + "source": [ + "### Fermer la connexion à la fin du travail\n", + "cn.close()" + ] + }, + { + "cell_type": "markdown", + "id": "a8c46c0b", + "metadata": {}, + "source": [ + "## Récupérer les données sameAs depuis le point d'accès Wikidata" + ] + }, + { + "cell_type": "code", + "execution_count": 12, 
+ "id": "d8a14d74-dfb0-4ea6-9a32-f3aa2036fbe8", + "metadata": { + "metadata": {} + }, + "outputs": [], + "source": [ + "## define SPARQL enpoint\n", + "endpoint = \"https://query.wikidata.org/sparql\"" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "b97ef6ec-8213-40bf-a7ee-97107b5efd13", + "metadata": { + "metadata": {} + }, + "outputs": [], + "source": [ + "q1 = \"\"\"\n", + "PREFIX owl: \n", + "PREFIX wd: \n", + "PREFIX wdt: \n", + "\n", + "SELECT DISTINCT ?item ?uri_dbpedia\n", + " WHERE {\n", + " # SERVICE \n", + " {\n", + " \n", + " {\n", + " {?item wdt:P106 wd:Q169470}\n", + " UNION\n", + " {?item wdt:P106 wd:Q11063}\n", + " UNION\n", + " {?item wdt:P106 wd:Q155647}\n", + " }\n", + " \n", + " ?item wdt:P31 wd:Q5; # Any instance of a human.\n", + " wdt:P569 ?birthDate.\n", + " BIND(REPLACE(str(?birthDate), \"(.*)([0-9]{4})(.*)\", \"$2\") AS ?year)\n", + " FILTER(xsd:integer(?year) > 1300 )\n", + " # OPTIONAL {\n", + " ?article schema:about ?item .\n", + " ?article schema:inLanguage \"en\" .\n", + " FILTER (SUBSTR(str(?article), 1, 25) = \"https://en.wikipedia.org/\")\n", + " BIND (replace(str(?article), \"https://en.wikipedia.org/wiki/\", \"http://dbpedia.org/resource/\") AS ?uri_dbpedia) \n", + "\n", + "# }\n", + " }\n", + " }\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "ed1f769f-7663-4149-b88b-ba4103d0daef", + "metadata": { + "metadata": {} + }, + "outputs": [], + "source": [ + "### Executer la requête avec les fonctions de la librairie locale\n", + "qr = spqf.get_json_sparql_result(endpoint,q1)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "716679dc-515e-4ec5-b8b8-a57ee492ef6a", + "metadata": { + "metadata": {} + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "13267\n" + ] + }, + { + "data": { + "text/plain": [ + "[['http://www.wikidata.org/entity/Q27323',\n", + " 'http://dbpedia.org/resource/Friedrich_Risner'],\n", + " 
['http://www.wikidata.org/entity/Q27374',\n", + " 'http://dbpedia.org/resource/Karl_Eduard_Zetzsche'],\n", + " ['http://www.wikidata.org/entity/Q64839',\n", + " 'http://dbpedia.org/resource/Max_Wien']]" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "### Stocker le résultat dans une variable liste et afficher\n", + "r = [l[:3] for l in spqf.sparql_result_to_list(qr)]\n", + "print(len(r))\n", + "r[:3]" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "f6f180bb", + "metadata": { + "metadata": {} + }, + "outputs": [], + "source": [ + "### Se connecter à la base de données dans laquelle on va insérer\n", + "# le résultat de la requête SPARQL\n", + "cn = sql.connect('../../data/astronomers_import.db')\n", + "# cn" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "2ad6e7a1-713a-4ee2-9c7c-0738b5bc662d", + "metadata": { + "metadata": {} + }, + "outputs": [], + "source": [ + "### Insérer les nouvelles lignes\n", + "# ATTENTION : si elles existent déjà, dans le même ordre, rien ne sera ajouté\n", + "# Noter que c'est toute la liste qui est insérée\n", + "cur = cn.cursor()\n", + "cur.executemany('''INSERT INTO same_as_uri(subject,object) VALUES (?,?)''', r)\n", + "cn.commit()" + ] + }, + { + "cell_type": "markdown", + "id": "fcf8ef2b-2d99-4e65-9fe8-866316dfc0ac", + "metadata": {}, + "source": [ + "## Récupérer les identifiants DBpedia d'une partie de la population et interroger DBpedia" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "4a052359-3e22-445b-9754-922219be4d1d", + "metadata": { + "metadata": {} + }, + "outputs": [], + "source": [ + "### \n", + "cur = cn.cursor()\n", + "l1 = cur.execute(\"\"\"SELECT sau.\"object\", sau.subject\n", + "FROM wdt_personne wp, same_as_uri sau \n", + "WHERE sau.subject = wp.personUri\n", + "AND wp.birthYear > 1780\"\"\").fetchall()\n", + "# l1 = cur.execute(\"SELECT object FROM same_as_uri limit 200 offset 
300\").fetchall()" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "6cd0a5b4-016a-4375-a2ab-537702f59589", + "metadata": { + "metadata": {} + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "http://dbpedia.org/resource/David_Brewster\n", + "http://dbpedia.org/resource/Joseph_Johann_von_Littrow\n", + "http://dbpedia.org/resource/Giovanni_Plana\n", + "http://dbpedia.org/resource/Sim%C3%A9on_Denis_Poisson\n", + "http://dbpedia.org/resource/Henry_Atkinson_(scientist)\n" + ] + } + ], + "source": [ + "_l = [print(e[0]) for e in l1[:5]]" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "26fd120e-d5bd-457b-9ff3-cec20a4c144d", + "metadata": { + "metadata": {} + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "11992\n" + ] + } + ], + "source": [ + "print(len(l1))" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "7f270042-742b-4ab3-963b-0dd0e8e446b7", + "metadata": { + "metadata": {} + }, + "outputs": [], + "source": [ + "## define SPARQL enpoint\n", + "endpoint_dbp = \"https://dbpedia.org/sparql\"" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "id": "646fd434", + "metadata": { + "metadata": {} + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "300\n" + ] + }, + { + "data": { + "text/plain": [ + "(['{ ?p ?o} UNION ',\n", + " '{ ?p ?o} UNION ',\n", + " '{ ?p ?o} UNION '],\n", + " '{ ?p ?o}')" + ] + }, + "execution_count": 78, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "### Préparer la liste des individus qui servira de base à la requête\n", + "lq = []\n", + "i = 1\n", + "\n", + "la = l1[1000:1300]\n", + "ll = len(la)\n", + "\n", + "for e in la:\n", + " if i < ll:\n", + " lq.append('{<' + e[0] + '> ?p ?o} UNION ')\n", + " i += 1\n", + " else:\n", + " lq.append('{ <' + e[0] + '> ?p ?o}')\n", + "\n", + "\n", + "print(len(lq))\n", + "lq[:3], lq[-1]" + ] + }, + { + 
"cell_type": "code", + "execution_count": 79, + "id": "4da04fe7", + "metadata": { + "metadata": {} + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{ ?p ?o} UNION { ?p ?o} UNION { ?p ?o} UNION { ?p ?o} UNION { ?p ?o} UNION { ?p ?o} UNION { ?p ?o} UNION {\n" + ] + } + ], + "source": [ + "### Transformer la liste en chaîne de caractères\n", + "jlq = ''.join(lq)\n", + "# Ispecter les premières 500 caractères\n", + "print(jlq[:500])" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "id": "839f5f2a-5413-4a55-a341-a4632f74ed2b", + "metadata": { + "metadata": {} + }, + "outputs": [], + "source": [ + "### Préparer la requête SPARQl à exécuter\n", + "q2 = \"\"\"\n", + "PREFIX dbr: \n", + "PREFIX rdfs: \n", + "PREFIX dbp: \n", + "\n", + "SELECT ?p (COUNT(*) as ?eff)\n", + "WHERE {\n", + " \n", + " {\n", + " \n", + "\"\"\" + jlq + \"\"\"\n", + "\n", + " }\n", + "\n", + "}\n", + "GROUP BY ?p\n", + "ORDER BY DESC(?eff)\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "id": "6befc6b2", + "metadata": { + "metadata": {} + }, + "outputs": [], + "source": [ + "### Executer la requête avec les fonctions de la librairie locale\n", + "qr = spqf.get_json_sparql_result(endpoint_dbp,q2)" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "id": "6920743f", + "metadata": { + "metadata": {} + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "60\n", + "[['http://dbpedia.org/ontology/wikiPageWikiLink', '11968'],\n", + " ['http://dbpedia.org/ontology/abstract', '2030'],\n", + " ['http://dbpedia.org/ontology/wikiPageExternalLink', '888'],\n", + " ['http://dbpedia.org/ontology/wikiPageLength', '260'],\n", + " ['http://dbpedia.org/ontology/wikiPageID', '260'],\n", + " ['http://dbpedia.org/ontology/wikiPageRevisionID', '260'],\n", + " ['http://dbpedia.org/ontology/birthPlace', '217'],\n", + " ['http://dbpedia.org/ontology/knownFor', '197'],\n", + " 
['http://dbpedia.org/ontology/thumbnail', '191'],\n", + " ['http://dbpedia.org/ontology/deathPlace', '172'],\n", + " ['http://dbpedia.org/ontology/birthDate', '135'],\n", + " ['http://dbpedia.org/ontology/deathDate', '126'],\n", + " ['http://dbpedia.org/ontology/academicDiscipline', '119'],\n", + " ['http://dbpedia.org/ontology/almaMater', '106'],\n", + " ['http://dbpedia.org/ontology/award', '105'],\n", + " ['http://dbpedia.org/ontology/institution', '55'],\n", + " ['http://dbpedia.org/ontology/doctoralStudent', '54'],\n", + " ['http://dbpedia.org/ontology/occupation', '43'],\n", + " ['http://dbpedia.org/ontology/nationality', '36'],\n", + " ['http://dbpedia.org/ontology/doctoralAdvisor', '33'],\n", + " ['http://dbpedia.org/ontology/deathYear', '27'],\n", + " ['http://dbpedia.org/ontology/birthYear', '27'],\n", + " ['http://dbpedia.org/ontology/birthName', '23'],\n", + " ['http://dbpedia.org/ontology/academicAdvisor', '19'],\n", + " ['http://dbpedia.org/ontology/careerStation', '17'],\n", + " ['http://dbpedia.org/ontology/team', '14'],\n", + " ['http://dbpedia.org/ontology/influencedBy', '12'],\n", + " ['http://dbpedia.org/ontology/notableStudent', '11'],\n", + " ['http://dbpedia.org/ontology/restingPlace', '10'],\n", + " ['http://dbpedia.org/ontology/termPeriod', '8'],\n", + " ['http://dbpedia.org/ontology/child', '7'],\n", + " ['http://dbpedia.org/ontology/spouse', '6'],\n", + " ['http://dbpedia.org/ontology/citizenship', '5'],\n", + " ['http://dbpedia.org/ontology/residence', '5'],\n", + " ['http://dbpedia.org/ontology/education', '5'],\n", + " ['http://dbpedia.org/ontology/significantProject', '5'],\n", + " ['http://dbpedia.org/ontology/influenced', '4'],\n", + " ['http://dbpedia.org/ontology/stateOfOrigin', '4'],\n", + " ['http://dbpedia.org/ontology/signature', '3'],\n", + " ['http://dbpedia.org/ontology/position', '2'],\n", + " ['http://dbpedia.org/ontology/alias', '1'],\n", + " ['http://dbpedia.org/ontology/militaryBranch', '1'],\n", + " 
['http://dbpedia.org/ontology/relation', '1'],\n", + " ['http://dbpedia.org/ontology/motto', '1'],\n", + " ['http://dbpedia.org/ontology/successor', '1'],\n", + " ['http://dbpedia.org/ontology/country', '1'],\n", + " ['http://dbpedia.org/ontology/activeYearsStartYear', '1'],\n", + " ['http://dbpedia.org/ontology/discipline', '1'],\n", + " ['http://dbpedia.org/ontology/serviceStartYear', '1'],\n", + " ['http://dbpedia.org/ontology/religion', '1']]\n" + ] + } + ], + "source": [ + "r = [l for l in spqf.sparql_result_to_list(qr) if 'ontolo' in l[0]]\n", + "print(len(r))\n", + "import pprint\n", + "pprint.pprint(r[:50])" + ] + }, + { + "cell_type": "markdown", + "id": "326ed3ec", + "metadata": {}, + "source": [ + "### Traitement de l'ensemble" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "id": "72126ecb", + "metadata": { + "metadata": {} + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "600\n", + "900\n", + "1200\n", + "1500\n", + "1800\n", + "2100\n", + "2400\n", + "2700\n", + "3000\n", + "3300\n", + "3600\n", + "3900\n", + "4200\n", + "4500\n", + "4800\n", + "5100\n", + "5400\n", + "5700\n", + "6000\n", + "6300\n", + "6600\n", + "6900\n", + "7200\n", + "7500\n", + "7800\n", + "8100\n", + "8400\n", + "8700\n", + "9000\n", + "9300\n", + "9600\n", + "9900\n", + "10200\n", + "10500\n", + "10800\n", + "11100\n", + "11400\n", + "11700\n", + "12000\n" + ] + } + ], + "source": [ + "ia = 0\n", + "ib = 300\n", + "\n", + "l_final = []\n", + "\n", + "while ib < len(l1):\n", + " lq = []\n", + " i = 1\n", + "\n", + " la = l1[ia:ib]\n", + " ll = len(la)\n", + "\n", + " for e in la:\n", + " if i < ll:\n", + " lq.append('{<' + e[0] + '> ?p ?o} UNION ')\n", + " i += 1\n", + " else:\n", + " lq.append('{ <' + e[0] + '> ?p ?o}')\n", + "\n", + " jlq = ''.join(lq)\n", + " q2 = \"\"\"\n", + " PREFIX dbr: \n", + " PREFIX rdfs: \n", + " PREFIX dbp: \n", + "\n", + " SELECT ?p (COUNT(*) as ?eff)\n", + " WHERE {\n", + " \n", + " {\n", + " \n", 
+ " \"\"\" + jlq + \"\"\"\n", + "\n", + " }\n", + "\n", + " }\n", + " GROUP BY ?p\n", + " ORDER BY DESC(?eff)\n", + " \"\"\"\n", + "\n", + " qr = spqf.get_json_sparql_result(endpoint_dbp,q2)\n", + " r = [l for l in spqf.sparql_result_to_list(qr) if 'ontolo' in l[0]]\n", + " l_final += r\n", + "\n", + " ia += 300\n", + " ib += 300\n", + " print(ib)" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "id": "c1fd0469", + "metadata": { + "metadata": {} + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2189\n" + ] + } + ], + "source": [ + "print(len(l_final))" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "id": "55ab2558", + "metadata": { + "metadata": {} + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 2189 entries, 0 to 2188\n", + "Data columns (total 2 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 p 2189 non-null object\n", + " 1 eff 2189 non-null object\n", + "dtypes: object(2)\n", + "memory usage: 34.3+ KB\n" + ] + }, + { + "data": { + "text/plain": [ + "( p eff\n", + " 0 http://dbpedia.org/ontology/wikiPageWikiLink 12858\n", + " 1 http://dbpedia.org/ontology/abstract 2537\n", + " 2 http://dbpedia.org/ontology/wikiPageExternalLink 1055\n", + " 3 http://dbpedia.org/ontology/birthPlace 263\n", + " 4 http://dbpedia.org/ontology/wikiPageLength 259,\n", + " None)" + ] + }, + "execution_count": 93, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_lf = pd.DataFrame(l_final, columns=['p', 'eff'])\n", + "df_lf.head(), df_lf.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "id": "b569eb13", + "metadata": { + "metadata": {} + }, + "outputs": [], + "source": [ + "df_lf['eff'] = df_lf['eff'].astype('int64')" + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "id": "9c5ae8fe", + "metadata": { + "metadata": {} + }, + "outputs": [ + { + 
"data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
eff
p
http://dbpedia.org/ontology/wikiPageWikiLink441882
http://dbpedia.org/ontology/abstract56882
http://dbpedia.org/ontology/wikiPageExternalLink31014
http://dbpedia.org/ontology/wikiPageID10324
http://dbpedia.org/ontology/wikiPageLength10324
http://dbpedia.org/ontology/wikiPageRevisionID10324
http://dbpedia.org/ontology/birthPlace8391
http://dbpedia.org/ontology/almaMater6543
http://dbpedia.org/ontology/knownFor6429
http://dbpedia.org/ontology/award6153
http://dbpedia.org/ontology/academicDiscipline5571
http://dbpedia.org/ontology/birthDate5317
http://dbpedia.org/ontology/thumbnail5227
http://dbpedia.org/ontology/institution4999
http://dbpedia.org/ontology/deathPlace4120
http://dbpedia.org/ontology/deathDate3171
http://dbpedia.org/ontology/doctoralStudent1935
http://dbpedia.org/ontology/doctoralAdvisor1792
http://dbpedia.org/ontology/nationality1665
http://dbpedia.org/ontology/occupation1608
http://dbpedia.org/ontology/birthName1243
http://dbpedia.org/ontology/birthYear943
http://dbpedia.org/ontology/deathYear642
http://dbpedia.org/ontology/residence598
http://dbpedia.org/ontology/influencedBy527
http://dbpedia.org/ontology/termPeriod504
http://dbpedia.org/ontology/citizenship457
http://dbpedia.org/ontology/education438
http://dbpedia.org/ontology/academicAdvisor423
http://dbpedia.org/ontology/notableStudent394
\n", + "
" + ], + "text/plain": [ + " eff\n", + "p \n", + "http://dbpedia.org/ontology/wikiPageWikiLink 441882\n", + "http://dbpedia.org/ontology/abstract 56882\n", + "http://dbpedia.org/ontology/wikiPageExternalLink 31014\n", + "http://dbpedia.org/ontology/wikiPageID 10324\n", + "http://dbpedia.org/ontology/wikiPageLength 10324\n", + "http://dbpedia.org/ontology/wikiPageRevisionID 10324\n", + "http://dbpedia.org/ontology/birthPlace 8391\n", + "http://dbpedia.org/ontology/almaMater 6543\n", + "http://dbpedia.org/ontology/knownFor 6429\n", + "http://dbpedia.org/ontology/award 6153\n", + "http://dbpedia.org/ontology/academicDiscipline 5571\n", + "http://dbpedia.org/ontology/birthDate 5317\n", + "http://dbpedia.org/ontology/thumbnail 5227\n", + "http://dbpedia.org/ontology/institution 4999\n", + "http://dbpedia.org/ontology/deathPlace 4120\n", + "http://dbpedia.org/ontology/deathDate 3171\n", + "http://dbpedia.org/ontology/doctoralStudent 1935\n", + "http://dbpedia.org/ontology/doctoralAdvisor 1792\n", + "http://dbpedia.org/ontology/nationality 1665\n", + "http://dbpedia.org/ontology/occupation 1608\n", + "http://dbpedia.org/ontology/birthName 1243\n", + "http://dbpedia.org/ontology/birthYear 943\n", + "http://dbpedia.org/ontology/deathYear 642\n", + "http://dbpedia.org/ontology/residence 598\n", + "http://dbpedia.org/ontology/influencedBy 527\n", + "http://dbpedia.org/ontology/termPeriod 504\n", + "http://dbpedia.org/ontology/citizenship 457\n", + "http://dbpedia.org/ontology/education 438\n", + "http://dbpedia.org/ontology/academicAdvisor 423\n", + "http://dbpedia.org/ontology/notableStudent 394" + ] + }, + "execution_count": 106, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ldf = df_lf.groupby(by='p').sum().sort_values(by='eff', ascending=False)\n", + "ldf.head(30)\n" + ] + }, + { + "cell_type": "markdown", + "id": "3def96ad", + "metadata": {}, + "source": [ + "## Récupérer les universités des études" + ] + }, + { + "cell_type": "code", + 
"execution_count": 108, + "id": "db392096", + "metadata": { + "metadata": {} + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "300\n" + ] + }, + { + "data": { + "text/plain": [ + "(['{BIND( as ?wd_item). dbo:almaMater ?am} UNION ',\n", + " '{BIND( as ?wd_item). dbo:almaMater ?am} UNION ',\n", + " '{BIND( as ?wd_item). dbo:almaMater ?am} UNION '],\n", + " '{BIND( as ?wd_item). dbo:almaMater ?am}')" + ] + }, + "execution_count": 108, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "### Préparer la liste des individus tout en ajoutant l'URI wikidata\n", + "lq = []\n", + "i = 1\n", + "\n", + "la = l1[1000:1300]\n", + "ll = len(la)\n", + "\n", + "for e in la:\n", + " if i < ll:\n", + " lq.append('{BIND( <'+ e[1]+ '> as ?wd_item). <' + e[0] + '> dbo:almaMater ?am} UNION ')\n", + " i += 1\n", + " else:\n", + " lq.append('{BIND( <'+ e[1]+ '> as ?wd_item). <' + e[0] + '> dbo:almaMater ?am}')\n", + "\n", + "\n", + "print(len(lq))\n", + "lq[:3], lq[-1]" + ] + }, + { + "cell_type": "code", + "execution_count": 109, + "id": "5fe4b31b", + "metadata": { + "metadata": {} + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{BIND( as ?wd_item). dbo:almaMater ?am} UNION {BIND( as ?wd_item). dbo:almaMater ?am} UNION {BIND( as ?wd_item). dbo:almaMater ?am} UNION {BIND( as ?wd_item). 
\n", + "PREFIX rdfs: \n", + "PREFIX dbp: \n", + "PREFIX dbo: \n", + "\n", + "SELECT ?wd_item ?am\n", + "WHERE {\n", + " \n", + " {\n", + " \n", + "\"\"\" + jlq + \"\"\"\n", + "\n", + " }\n", + "\n", + "}\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 115, + "id": "7ca0839d", + "metadata": { + "metadata": {} + }, + "outputs": [], + "source": [ + "### Executer la requête avec les fonctions de la librairie locale\n", + "qr = spqf.get_json_sparql_result(endpoint_dbp,q2)" + ] + }, + { + "cell_type": "code", + "execution_count": 116, + "id": "d5aef58b", + "metadata": { + "metadata": {} + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "106\n", + "[['http://www.wikidata.org/entity/Q505197',\n", + " 'http://dbpedia.org/resource/Rensselaer_Polytechnic_Institute'],\n", + " ['http://www.wikidata.org/entity/Q15429143',\n", + " 'http://dbpedia.org/resource/Yale_University'],\n", + " ['http://www.wikidata.org/entity/Q128823',\n", + " 'http://dbpedia.org/resource/University_College_London'],\n", + " ['http://www.wikidata.org/entity/Q128823',\n", + " 'http://dbpedia.org/resource/Royal_College_of_Science'],\n", + " ['http://www.wikidata.org/entity/Q64481',\n", + " 'http://dbpedia.org/resource/University_of_Leipzig'],\n", + " ['http://www.wikidata.org/entity/Q84783',\n", + " 'http://dbpedia.org/resource/University_of_Vienna'],\n", + " ['http://www.wikidata.org/entity/Q1627897',\n", + " 'http://dbpedia.org/resource/Trinity_College,_Cambridge'],\n", + " ['http://www.wikidata.org/entity/Q446620',\n", + " 'http://dbpedia.org/resource/Yale_College'],\n", + " ['http://www.wikidata.org/entity/Q5539344',\n", + " 'http://dbpedia.org/resource/University_of_St_Andrews'],\n", + " ['http://www.wikidata.org/entity/Q5539344',\n", + " \"http://dbpedia.org/resource/Christ's_College,_Cambridge\"]]\n" + ] + } + ], + "source": [ + "r = [l[:3] for l in spqf.sparql_result_to_list(qr)]\n", + "print(len(r))\n", + "import pprint\n", + 
"pprint.pprint(r[:10])" + ] + }, + { + "cell_type": "markdown", + "id": "40c4c5cd", + "metadata": {}, + "source": [ + "### Traitement de l'ensemble" + ] + }, + { + "cell_type": "code", + "execution_count": 145, + "id": "d96aa3aa", + "metadata": { + "metadata": {} + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "300 600\n", + "600 900\n", + "900 1200\n", + "1200 1500\n", + "1500 1800\n", + "1800 2100\n", + "2100 2400\n", + "2400 2700\n", + "2700 3000\n", + "3000 3300\n", + "3300 3600\n", + "3600 3900\n", + "3900 4200\n", + "4200 4500\n", + "4500 4800\n", + "4800 5100\n", + "5100 5400\n", + "5400 5700\n", + "5700 6000\n", + "6000 6300\n", + "6300 6600\n", + "6600 6900\n", + "6900 7200\n", + "7200 7500\n", + "7500 7800\n", + "7800 8100\n", + "8100 8400\n", + "8400 8700\n", + "8700 9000\n", + "9000 9300\n", + "9300 9600\n", + "9600 9900\n", + "9900 10200\n", + "10200 10500\n", + "10500 10800\n", + "10800 11100\n", + "11100 11400\n", + "11400 11700\n", + "11700 12000\n" + ] + } + ], + "source": [ + "ia = 0\n", + "ib = 300\n", + "\n", + "l_final = []\n", + "\n", + "while ib < len(l1):\n", + " lq = []\n", + " i = 1\n", + "\n", + " la = l1[ia:ib]\n", + " ll = len(la)\n", + "\n", + " for e in la:\n", + " if i < ll:\n", + " lq.append('{BIND( <'+ e[1]+ '> as ?wd_item). <' + e[0] + '> dbo:almaMater ?am} UNION ')\n", + " i += 1\n", + " else:\n", + " lq.append('{BIND( <'+ e[1]+ '> as ?wd_item). 
<' + e[0] + '> dbo:almaMater ?am}')\n", + "\n", + " jlq = ''.join(lq)\n", + " q2 = \"\"\"\n", + " PREFIX dbr: \n", + " PREFIX rdfs: \n", + " PREFIX dbp: \n", + " PREFIX dbo: \n", + "\n", + " SELECT ?wd_item ?am ?am_label ?geo_point\n", + " WHERE {\n", + " \n", + " {\n", + " \n", + " \"\"\" + jlq + \"\"\"\n", + "\n", + " }\n", + "\n", + " {?am rdfs:label ?am_label.\n", + " FILTER(lang(?am_label)=\"en\")}\n", + " OPTIONAL {?am ?geo_point}\n", + "\n", + " }\n", + " \"\"\"\n", + "\n", + " qr = spqf.get_json_sparql_result(endpoint_dbp,q2)\n", + " r = [l for l in spqf.sparql_result_to_list(qr)]\n", + " l_final += r\n", + "\n", + " ia += 300\n", + " ib += 300\n", + " print(ib-300, ib)" + ] + }, + { + "cell_type": "code", + "execution_count": 146, + "id": "9cf9cf6e", + "metadata": { + "metadata": {} + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "6532\n" + ] + }, + { + "data": { + "text/plain": [ + "[['http://www.wikidata.org/entity/Q168468',\n", + " 'http://dbpedia.org/resource/University_of_Edinburgh',\n", + " 'University of Edinburgh',\n", + " '55.94738888888889 -3.1871944444444447'],\n", + " ['http://www.wikidata.org/entity/Q302590',\n", + " 'http://dbpedia.org/resource/Charles_University',\n", + " 'Charles University',\n", + " '50.0884 14.4037'],\n", + " ['http://www.wikidata.org/entity/Q373799',\n", + " 'http://dbpedia.org/resource/École_Polytechnique',\n", + " 'École Polytechnique',\n", + " '']]" + ] + }, + "execution_count": 146, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print(len(l_final))\n", + "l_final[:3]" + ] + }, + { + "cell_type": "code", + "execution_count": 148, + "id": "8e1e54ff", + "metadata": { + "metadata": {} + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 6532 entries, 0 to 6531\n", + "Data columns (total 4 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 wd_uri 6532 
non-null object\n", + " 1 almaM_uri 6532 non-null object\n", + " 2 label 6532 non-null object\n", + " 3 geocoord 6532 non-null object\n", + "dtypes: object(4)\n", + "memory usage: 204.2+ KB\n" + ] + } + ], + "source": [ + "df_almaM = pd.DataFrame(l_final, columns=['wd_uri', 'almaM_uri', 'label', 'geocoord'])\n", + "df_almaM.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 149, + "id": "e909e536", + "metadata": { + "metadata": {} + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
wd_urialmaM_urilabelgeocoord
0http://www.wikidata.org/entity/Q168468http://dbpedia.org/resource/University_of_Edin...University of Edinburgh55.94738888888889 -3.1871944444444447
1http://www.wikidata.org/entity/Q302590http://dbpedia.org/resource/Charles_UniversityCharles University50.0884 14.4037
2http://www.wikidata.org/entity/Q373799http://dbpedia.org/resource/École_PolytechniqueÉcole Polytechnique
\n", + "
" + ], + "text/plain": [ + " wd_uri \\\n", + "0 http://www.wikidata.org/entity/Q168468 \n", + "1 http://www.wikidata.org/entity/Q302590 \n", + "2 http://www.wikidata.org/entity/Q373799 \n", + "\n", + " almaM_uri label \\\n", + "0 http://dbpedia.org/resource/University_of_Edin... University of Edinburgh \n", + "1 http://dbpedia.org/resource/Charles_University Charles University \n", + "2 http://dbpedia.org/resource/École_Polytechnique École Polytechnique \n", + "\n", + " geocoord \n", + "0 55.94738888888889 -3.1871944444444447 \n", + "1 50.0884 14.4037 \n", + "2 " + ] + }, + "execution_count": 149, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_almaM.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "92d75e41", + "metadata": { + "metadata": {} + }, + "outputs": [], + "source": [ + "try:\n", + " df_almaM.to_sql(con=cn, name='wdt_alma_mater_from_dbpedia', if_exists='fail')\n", + "except Exception as e:\n", + " print('Error:', e) " + ] + }, + { + "cell_type": "code", + "execution_count": 158, + "id": "2d78978e-bc7d-4890-8daa-42e47f5d7a49", + "metadata": { + "metadata": {} + }, + "outputs": [ + { + "data": { + "text/plain": [ + "label\n", + "Harvard University 195\n", + "University of Cambridge 181\n", + "Massachusetts Institute of Technology 164\n", + "Princeton University 143\n", + "University of California, Berkeley 121\n", + "California Institute of Technology 118\n", + "University of Chicago 113\n", + "Cornell University 108\n", + "Columbia University 97\n", + "Moscow State University 81\n", + "dtype: int64" + ] + }, + "execution_count": 158, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_count= df_almaM.groupby(by='label').size().sort_values(ascending=False)\n", + "df_count.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c921330", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + 
"display_name": "Py3.10 sparql", + "language": "python", + "name": "py310_sparql" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}