Skip to content

Commit

Permalink
codas des occupation
Browse files Browse the repository at this point in the history
  • Loading branch information
atterebf committed May 6, 2024
1 parent 248fe3c commit f40621d
Show file tree
Hide file tree
Showing 4 changed files with 168 additions and 2 deletions.
160 changes: 160 additions & 0 deletions Wikidata/codage_occupations.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
/*** Codage des occupations
*
* Dans ce document sont présentés différents scripts SQL qui illustrent
* le processus de codage des occupations
*
*/

-- compter les relations personne-occupations
SELECT count(*)
FROM wdt_person_occupation wpo;

-- regrouper par occupation avec tri par effectif déscendant
SELECT occupationUri, occupationLabel, COUNT(*) as effectif
FROM wdt_person_occupation
GROUP BY occupationUri , occupationLabel
ORDER BY effectif DESC;


-- distribution des effectifs par plages
WITH tw1 AS (
SELECT occupationUri, occupationLabel, COUNT(*) as effectif
FROM wdt_person_occupation
GROUP BY occupationUri, occupationLabel ),
tw2 AS (
SELECT 1001 a, 30000 b
UNION
SELECT 501 a, 1000 b
UNION
SELECT 101 a, 500 b
UNION
SELECT 51 a, 100 b
UNION
SELECT 21 a, 50 b
UNION
SELECT 11 a, 20 b
UNION
SELECT 5 a, 10 b
UNION
SELECT 2 a, 4 b
UNION
SELECT 0 a, 1 b
)
SELECT CAST(a as 'str') || '-' || CAST(b as 'str') as plage, count(*) as effectif, SUM(effectif) as sum , group_concat(occupationLabel, '; ')
FROM tw1 JOIN tw2 ON tw1.effectif BETWEEN tw2.a AND tw2.b
GROUP BY plage
ORDER BY effectif DESC;

-- inspecter les effectifs d'activités par personne
SELECT wp.personUri, wp.personLabel, count(*) as effectif,
min(wp.birthYear) birthYear,
group_concat(distinct occupationLabel) occupationLabels
FROM wdt_person_occupation wpo, wdt_personne wp
WHERE wp.personUri = wpo.personUri
GROUP BY wp.personUri, wp.personLabel
ORDER BY effectif DESC;


-- Vérifier qu'il n'y a pas de doublons de personnnes
WITH tw1 as (
SELECT wp.personUri, wp.personLabel, count(*) as effectif,
min(wp.birthYear) birthYear,
group_concat(distinct occupationLabel) occupationLabels
FROM wdt_person_occupation wpo, wdt_personne wp
WHERE wp.personUri = wpo.personUri
GROUP BY wp.personUri, wp.personLabel
)
SELECT *
FROM tw1
GROUP BY personUri, personLabel
HAVING COUNT(*) > 1;
LIMIT 10;

-- nombre de personnes par effectif
WITH tw1 as (
SELECT wp.personUri, wp.personLabel, count(*) as effectif,
min(wp.birthYear) birthYear,
group_concat(distinct occupationLabel) occupationLabels
FROM wdt_person_occupation wpo, wdt_personne wp
WHERE wp.personUri = wpo.personUri
GROUP BY wp.personUri, wp.personLabel
)
SELECT effectif AS eff_activite, count(*) AS effectif_eff
FROM tw1
GROUP BY effectif
ORDER BY effectif_eff DESC;


/*** CODAGE
*
* Après avoir créé et alimenté une table occupation,
* afin de disposer d'une seule ligne identifiant une occupation,
* on crée une table "occupation_domain" qui représente les domaines des occupations
* et qui est associée dans une relation de 1 à n à la table occupation.
* On crée ensuite une relation de clé étrangère dans la base de données
* et on peut coder avec la requête suivante dans DBeaver
*/


-- IMPORTANT : requête permettant l'association aux domaines. i.e. le codage,
-- dans un logiciel avec interface graphique
WITH TW1 AS (
SELECT occupationUri, occupationLabel, COUNT(*) as effectif
FROM wdt_person_occupation
GROUP BY occupationUri , occupationLabel)
SELECT wo.pk_wdt_occupation, occupationUri, occupationLabel, effectif, fk_domain
FROM tw1, wdt_occupation wo
WHERE tw1.occupationUri = wo.wdt_uri
ORDER BY effectif DESC;



-- inspecter les codages
SELECT wp.personUri, wp.personLabel, occupationLabel, od.label
FROM wdt_person_occupation po
JOIN wdt_occupation wo ON po.occupationUri = wo.wdt_uri
JOIN wdt_personne wp ON wp.personUri = po.personUri
LEFT JOIN occupation_domain od ON od.pk_occupation_domain = wo.fk_domain
--WHERE od.label IS NULL
LIMIT 20;


-- inspecter les personnes
SELECT wp.personUri, wp.personLabel, count(*) AS eff, GROUP_CONCAT(occupationLabel) occupations,
GROUP_CONCAT(od.label) domaines
FROM wdt_person_occupation po
JOIN wdt_occupation wo ON po.occupationUri = wo.wdt_uri
JOIN wdt_personne wp ON wp.personUri = po.personUri
LEFT JOIN occupation_domain od ON od.pk_occupation_domain = wo.fk_domain
GROUP BY wp.personUri, wp.personLabel
--HAVING COUNT(od.label) = 1
ORDER BY eff DESC
LIMIT 100;


WTIH tw1 AS (
SELECT DISTINCT wp.personUri, wp.personLabel, od.label
FROM wdt_person_occupation po
JOIN wdt_occupation wo ON po.occupationUri = wo.wdt_uri
JOIN wdt_personne wp ON wp.personUri = po.personUri
LEFT JOIN occupation_domain od ON od.pk_occupation_domain = wo.fk_domain
ORDER BY wp.personUri, od.label)
SELECT wp.personUri, wp.personLabel, count(*), group_concat(od.label)
FROM tw1
GROUP BY wp.personUri, wp.personLabel;




-- regrouper par effectifs de domaines
WITH tw1 AS (
SELECT wp.personUri, wp.personLabel, count(*) AS eff, GROUP_CONCAT(od.label) domaines
FROM wdt_person_occupation po
JOIN wdt_occupation wo ON po.occupationUri = wo.wdt_uri
JOIN wdt_personne wp ON wp.personUri = po.personUri
LEFT JOIN occupation_domain od ON od.pk_occupation_domain = wo.fk_domain
GROUP BY wp.personUri, wp.personLabel)
SELECT *
FROM tw1
GROUP BY domaines;

Binary file modified data/astronomers_import.db
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,9 @@
"cell_type": "code",
"execution_count": 2,
"id": "0133375f",
"metadata": {},
"metadata": {
"metadata": {}
},
"outputs": [
{
"data": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -612,7 +612,11 @@
" WHERE name ='wdt_occupation';\n",
" \"\"\"\n",
"\n",
"cur = cn.cursor()\n",
"### ATTENTION : cette cellule contient le code nécessaire si on doit refaire la table\n",
"# généralement on ne doit pas l'exécuter, et si on veut vraiment l'exécuter \n",
"# il faut décommenter le code\n",
"\n",
"# cur = cn.cursor()\n",
"# r = cur.execute(q3)\n",
"# r = cur.execute(q4)\n",
"# cn.commit()"
Expand Down

0 comments on commit f40621d

Please sign in to comment.