From f40621d1080629bb76411abd5c5d48d3a9bdb12a Mon Sep 17 00:00:00 2001 From: Francesco Beretta Date: Mon, 6 May 2024 10:12:28 +0200 Subject: [PATCH] codas des occupation --- Wikidata/codage_occupations.sql | 160 ++++++++++++++++++ data/astronomers_import.db | Bin 21127168 -> 21127168 bytes .../wdt_occupations_exploration.ipynb | 4 +- .../wdt_occupations_production.ipynb | 6 +- 4 files changed, 168 insertions(+), 2 deletions(-) create mode 100644 Wikidata/codage_occupations.sql diff --git a/Wikidata/codage_occupations.sql b/Wikidata/codage_occupations.sql new file mode 100644 index 0000000..8ecc334 --- /dev/null +++ b/Wikidata/codage_occupations.sql @@ -0,0 +1,160 @@ +/*** Coding of occupations + * + * This document presents several SQL scripts that illustrate + * the process of coding occupations + * + */ + +-- count the person-occupation relations +SELECT count(*) +FROM wdt_person_occupation wpo; + +-- group by occupation, sorted by descending count +SELECT occupationUri, occupationLabel, COUNT(*) as effectif +FROM wdt_person_occupation +GROUP BY occupationUri , occupationLabel +ORDER BY effectif DESC; + + +-- distribution of the occupation counts over ranges (tw2 lists the inclusive bounds a-b of each range) +WITH tw1 AS ( +SELECT occupationUri, occupationLabel, COUNT(*) as effectif +FROM wdt_person_occupation +GROUP BY occupationUri, occupationLabel ), +tw2 AS ( +SELECT 1001 a, 30000 b +UNION ALL +SELECT 501 a, 1000 b +UNION ALL +SELECT 101 a, 500 b +UNION ALL +SELECT 51 a, 100 b +UNION ALL +SELECT 21 a, 50 b +UNION ALL +SELECT 11 a, 20 b +UNION ALL +SELECT 5 a, 10 b +UNION ALL +SELECT 2 a, 4 b +UNION ALL +SELECT 0 a, 1 b +) +SELECT CAST(a AS TEXT) || '-' || CAST(b AS TEXT) as plage, count(*) as effectif, SUM(tw1.effectif) as sum , group_concat(occupationLabel, '; ') +FROM tw1 JOIN tw2 ON tw1.effectif BETWEEN tw2.a AND tw2.b +GROUP BY plage +ORDER BY effectif DESC; + +-- inspect the number of occupations per person +SELECT wp.personUri, wp.personLabel, count(*) as effectif, + min(wp.birthYear) birthYear, 
group_concat(distinct occupationLabel) occupationLabels +FROM wdt_person_occupation wpo + JOIN wdt_personne wp ON wp.personUri = wpo.personUri +GROUP BY wp.personUri, wp.personLabel +ORDER BY effectif DESC; + + +-- Check that there are no duplicate persons. NOTE(review): tw1 is already grouped by (personUri, personLabel), so the outer grouping on the same keys looks unable to return rows - confirm the intended check +WITH tw1 as ( +SELECT wp.personUri, wp.personLabel, count(*) as effectif, + min(wp.birthYear) birthYear, + group_concat(distinct occupationLabel) occupationLabels +FROM wdt_person_occupation wpo + JOIN wdt_personne wp ON wp.personUri = wpo.personUri +GROUP BY wp.personUri, wp.personLabel +) +SELECT * +FROM tw1 +GROUP BY personUri, personLabel +HAVING COUNT(*) > 1 +LIMIT 10; + +-- number of persons for each number of occupations +WITH tw1 as ( +SELECT wp.personUri, wp.personLabel, count(*) as effectif, + min(wp.birthYear) birthYear, + group_concat(distinct occupationLabel) occupationLabels +FROM wdt_person_occupation wpo + JOIN wdt_personne wp ON wp.personUri = wpo.personUri +GROUP BY wp.personUri, wp.personLabel +) +SELECT effectif AS eff_activite, count(*) AS effectif_eff +FROM tw1 +GROUP BY effectif +ORDER BY effectif_eff DESC; + + +/*** CODING + * + * After creating and populating an occupation table, + * so that a single row identifies each occupation, + * we create an "occupation_domain" table that represents the domains of the occupations + * and that is linked to the occupation table in a one-to-many relationship. + * We then create a foreign-key relationship in the database + * and can do the coding with the following query in DBeaver + */ + + +-- IMPORTANT : requête permettant l'association aux domaines. i.e. 
le codage, +-- dans un logiciel avec interface graphique +WITH TW1 AS ( +SELECT occupationUri, occupationLabel, COUNT(*) as effectif +FROM wdt_person_occupation +GROUP BY occupationUri , occupationLabel) +SELECT wo.pk_wdt_occupation, occupationUri, occupationLabel, effectif, fk_domain +FROM tw1 + JOIN wdt_occupation wo ON tw1.occupationUri = wo.wdt_uri +ORDER BY effectif DESC; + + + +-- inspect the codings (domain label attached to each person-occupation row) +SELECT wp.personUri, wp.personLabel, occupationLabel, od.label +FROM wdt_person_occupation po + JOIN wdt_occupation wo ON po.occupationUri = wo.wdt_uri + JOIN wdt_personne wp ON wp.personUri = po.personUri + LEFT JOIN occupation_domain od ON od.pk_occupation_domain = wo.fk_domain +--WHERE od.label IS NULL +LIMIT 20; + + +-- inspect the persons (occupations and domains concatenated per person) +SELECT wp.personUri, wp.personLabel, count(*) AS eff, GROUP_CONCAT(occupationLabel) occupations, + GROUP_CONCAT(od.label) domaines +FROM wdt_person_occupation po + JOIN wdt_occupation wo ON po.occupationUri = wo.wdt_uri + JOIN wdt_personne wp ON wp.personUri = po.personUri + LEFT JOIN occupation_domain od ON od.pk_occupation_domain = wo.fk_domain +GROUP BY wp.personUri, wp.personLabel +--HAVING COUNT(od.label) = 1 +ORDER BY eff DESC +LIMIT 100; + +-- distinct domain labels per person (tw1 holds one row per person and label, including a NULL label when no domain is coded) +WITH tw1 AS ( +SELECT DISTINCT wp.personUri, wp.personLabel, od.label +FROM wdt_person_occupation po + JOIN wdt_occupation wo ON po.occupationUri = wo.wdt_uri + JOIN wdt_personne wp ON wp.personUri = po.personUri + LEFT JOIN occupation_domain od ON od.pk_occupation_domain = wo.fk_domain + ORDER BY wp.personUri, od.label) +SELECT personUri, personLabel, count(*), group_concat(label) +FROM tw1 +GROUP BY personUri, personLabel; + + + + +-- group the persons by their concatenated domain list +WITH tw1 AS ( +SELECT wp.personUri, wp.personLabel, count(*) AS eff, GROUP_CONCAT(od.label) domaines +FROM wdt_person_occupation po + JOIN wdt_occupation wo ON po.occupationUri = wo.wdt_uri + JOIN wdt_personne wp ON wp.personUri = po.personUri + LEFT JOIN occupation_domain od ON 
od.pk_occupation_domain = wo.fk_domain +GROUP BY wp.personUri, wp.personLabel) +SELECT * +FROM tw1 +GROUP BY domaines; + diff --git a/data/astronomers_import.db b/data/astronomers_import.db index 5de1163ff306552daf88a9b65d5529682f11773a..73914ba0397e17a905b5c820cb2e91c48dde12e7 100644 GIT binary patch delta 5699 zcmY+|34B!5y$A4f?!L{vfrPNb#6SW8LiQb!fXo0QTM-b95JG@~ERaP6W+s`05LpC+ zl>=%!p=uR))u4p|B~6f2;E52_)}>a!x==+FEl?5R{ce)J_j&nzz6WNzcg{Kg^FRMH zwe6d=T6kihV>=-cF@z{lgb3rCMx=||N&KRL2bVTar{VSDj7D(=1rB%+Km-Y7P(TF@ zbTGgHCRh*#y`VRQLj?4JNazdwpg#PKbhgAR30jaEO6e7y%<84o1Of z7z6Pz77`#4k{}sUAQjRe9Wo#jvLG9BU>uBxT*!lbm;e*u-jFzB(%p75!rDv;F=kmm zTi314p|CL5I!n+a10&gvp{^x0y_N)~VoV3ERWPbWIS%O<$sy-Caf~EM6NJ~LjmjUm zVN|ppqOS>k`8C|Ht!VR_*=p9A_t6~jUE_Jv;4}FBGdONK!!A^jG|8s+!baBK^rcx{LG0u)FJvMMFtdBr=dJq^k=( zY_B+A7>nPwP}by}aEXi}57Hub_KGk5w|6qAq$=ryB@%@v*#%acpc=`VeKkD1bk;js z6|#m%WH_$GuD$O|R@GElB7^N$gJWIs)C_exT?Q}b*iX)*?EEXf5zM+ZTcmQR*D#lp zDtacxTaSmjbSy*=7^e^mX_XiXTo|t>mSLk4)M#OcVb-!s2taQeb)YYM;u!`xm=_g zdu606UC}R-bljQz-z#izoXam5L6X9Pajw@Kj-^CQ&hBFm7rL~1j4MSovZ5psOBNE= zbk3K;I!C$Og2l*7Z2M@JhsBL?Wh>(%C1TIzuD$*=jM}9!u3l8hRJ#V9u=AgJlLbPE zr}3GwuFx}M-K!F6wi4^l)(&g7C6J%Zx6Cc%&*luXpW{QvvyMfMXya?6#aL@(>bJQ} z{Wv|UZ`Iv;Z|xK9MXgjzRTBww^tzf!GnBi^QKgnvE13$FJLFCBRJphGp|nR@ zEX9hK#C>9k7%$uqS_Qw5&Huu;^XvE=F5*Y-5Vw}32Rz-AS6o?HxQ-RxtR2q2KekkI z2+4uj-IKAjyog zqCJ3XcK(|bHdwuksaU6G zDL*UiN{uo}QRPmVkyiO3IYPQ9?UyQ~1o5)?vREbN(_^A0oDp^k_X)lDPxzhud_Ia3 z{>km**sO%eA#8E87^$Y!rP_0X6%^Ly9jgi1b-lVrgAJbSOVPB6bt5E_Y7g$gOTHY< z$gMLZGRA(qWt1_&Q&wEb&fjpwGUw=GQ47zgv+M!GwG&VJ5<|CJ z+$OIk)a6Pf+>Y!xzC81d;6%zTM_#=ER-OP{Yd9IbK;ar?S&tbjTf7x{fcGMKmq7{fy=jxKe zb;WF2KmSP0%B))?k!ZU!+rM!qX;y9>0;+2u%0BXCXuUJ*9_k6WrUL)y;JNWgl$tTI z9=XIGzTo*m@w5-?H_8`F9id$p-3MuC+h^_;G32}KR(RI_jXS-u{E8bU!xWeb_rWx{ zAEv_$m{(F^Do`W~`hpQ{h$3bcP{2ecJh zf_hDTPAyhN<(T4E^5xs|33(%Tk32;-q<5t!qy^F-nn(|dAB(%ig<_QOH=&tU3WY)p z|0Vw-`GNQFVcdCcH=ETg#;B9(kvKE$$ei`>HIdAh;&w7qx~*7zPQBZX(S@|ExVoa8 
z<*&lpUG&mCq>kMBEQyS@C-vKn-V9C6uAk67|BTjqQ*g;X%R4f6VXZ=kSr-MeapzC6`FAQtnmE!%Qo-X{H0?QDrZk z2Wc_8_O(Aj6Y}e4_9Wxuu^Yo#=_pqeb3Xo_MAcC1#oFh^&@*4w4p&0_ceOmFCzcIwfD>ce&g87JU}CN&8fLK`YTlaVOMcoJ-AD z9rQhZjnb@?aJqb5ZpHpO6J2~?3QEhQSUyEOBzpN}CN1IWdnaU!Uuvg5teROjz3Hl&-IFwfr2`^q zibG9bVqXK5Wc88~c4^k2;q1GPMY3w8E$G>55>H_PgsuEp!tPt$z2Hi*d0-xnANP+k zl2cQ()D`x7anJTzU%V=hpK5=1tftY<>Z)d`%F2hk4zN8=SB9XZkZd+2 z##I(P6A?*Q%h;W|O3_7re=^nD*7k{{4o*=%AQ`A%Nmq}8$NEOnRf_5(X}BY*HQX^+ z&<~#$)E`N1@I3xoVd#BG20BIlf2V>Q`bW~Q1^pDs33|}`R}@Po>DW)Pz5S4qT1c&8}i=Gf&l7Wj8(eCch6@Z10nhX#_1-VwWZ89{4kB+b^7kh4%zlcgAy#Hag4 z2V3y;+>q2Bpn3G(LrR@YX;xzxt%G{l2o0eyy6M^-OKTwE>K)#C%)DhjW#&B<);K^= z?`U};+EU-C_Q~rlkg-KGR}@y178ZGmM>~sFcuLt$t#QnhiQxhnykSyNd0Ba>r#Qc; z%JYb)>cl>0iRZ+=V&{owXG!ddeLb&^cCM}{tn#d>wqGi*cAnT*RlU@HpV3CL*k-Nq zR%DhbkTDO=EHClc@57yzR+M|H*iOAMCWP9iiVHW_5=U(LwBo9&isH&tK_CMQ*+R2% zQ2)`+mF3kHWrZc4%AU^_R~2EPhOiB09yI~JJ4bQ-{ z@EiCo1fd!B!tdZYcphGW-@}XW66}NhZ~*=b{s1q-E6@^ZHxK6J*Eq--l0>!xEAsDUkDMU=TM9`v(o`v2{EPUsxJZn)-m-RD^R2$-=jJPBg_&zo$5DsZ;ieG| zg={n4GoCgcF#75r>r5}uBav&GHIJ5}-csAtI^-Hf=@dQ|oL`jf4l4S#nwdBjAYHLzLMCamUd@yjHL#d9FXIEua`+Hv! 
z<=RHz*yW^ZYJls`0l`_lBWXOf+G_A`8CB9Y^yH|fPmr}+`plQ87956eQ5l+kB#k&VQ~lNq(PKA9quf0LB3yBq|?&l(u}SO^ht4^I6%0F!CoN@dxw2{uDeVg+8s^c0^A&;`tYs&||G|iFP&~?H@@|$+AFCNw7 z-r9~AXsDsCV2@AoC}8m0%y-4QJbT`$uS8 zM#KF*)lt-GZz8t%*PA<5D0pw7MAGd4UImJNzl3wwF8H zi8VWjb;8V849fkD(T8y4cYWy8lwg zHKQ?bwFCYPZ^2181*hS^;cYkr@4#6&2c2*p-i7zzefSIf6+VCu;UoAM{sy1G-{DiZ z02kpBd z;11l~8n{Y#fZcF*!`qEuH=^B0b|c%3VmGSYXm+F9jbS$qyD{y?vYRlw>18*)?IzrA zB4}ISY9Cq~HAQ=YL@6A>dAiAZ*orm3F%OyR%`C?c+I}R$iN;Oih~Y=UbxVH}KWaRn z57s_YIBhwS;E$+tE7UaQ7u1shrE6zABCFW})C>6Wg|gOp{uFAkQ!Rm3wxcDG8EQBf zD5s|54I+}%01C&h;-tl>8Nu3`S4or)CB7E;h_ioqNoVHaK(ab1kkoS?xi;d%$X$kS z1Zv604hP<%+ntfn7y3be7ytud5DbPPFch2+1@}NS41?hi1FKF#rGn delta 5020 zcmZA4cX$-l-Usk=ruRuGAxIzzC84EfHw{Pub|a!BAWcL<2@=2rQb1rg*+~eY35Iq= z(12p$y-a zO!0x2yXct+b%;VpibROZtnZz|XQJ4tsBzksr*OL(encHV0s{uHfCD@bfCwZY0|lr+ z13EB(34%Z{Xb0MZ4xl3l0i8f7=nNhMT|gM<3c7)C5CI}VchCd$1ie5Mhz7kuAJ7-{ z1N}h^hy`&V9wdN7kOY!}12{nnNCjyi9b|w3U?3O-GC>x|2036b7-I7yhCXQ4+MBBp zLR!B0r}?YhF37dOtk083eYmTjN(!@vArw5tlHiF>9je3khN6v#eS=%X*=!v5hk42* z=43O<_?cOWuVUFaXw(^_j8OfGzKwJ1GxWar?@YM%I~S}S)s|}`wN7d)u29!8chr%p zsGMPEDw~w&l&(uGoexiwrE1g1uHLmzD(wjeS-zd9BfMN=XCgrN<(2cBi*@ zL)&j1F)t;hKfE>~erIJ2S?$ot*BG4|KJUEg3ahMo!cYQ64_C-?fg zQWQsD0fnRSXkiJ7Z1DCb1-T_Wnc2@(DJ4eJ^T#97$a8Er19n*~%9Db{ z&%{mQXfaIqQdlEABXot1f0LiZCvZP=2RR>?&N1w9wvHXahA^Kq>zFZ2XZ#6XkM-f$ zK&_7k2SRq*6t`QDv#e49CD0!{_mzLOCr2(4&JDFL}@``ViFyAt(`FTx&H z;%>^SI)+TYS!d>;W2R|*VyrhN7(Mhab)t{a!?erVTiQ%?UyD(HQTM6~)NEB&T9s;L zn9^Q8FW1Y@$i1Xn(tc^alq%jAo5X6-CF;Vb!e(Kz(3$^^-^tJC9o%(p54V7GaSD5m zUC&NnJ1`fRx0ne`B)*3E-MAE|lKd@vs{Kc!JC_YgtA<(92S(MDin5~O!ZLF9Hx~>9 zXDM%I?{kk~WJk5*;gm!q$nS}>iS0hZusPMe3I4IA1My$+#>(l{Fy^D_D4LYvj%moW zJ7LkTKjijv@KJE!qX)a($t2J3-X#yJ)&vwopU(1r-qiigZXp71AmUf<@G31?{atNV zYtPN@{ZaFZrp_pzvB183#r+1B zb7}+uIiP_JXTxG~M-tl48*Tf(aqr@x?J$@gw!QdEZ>k;tgIi+d)apqB>Op`1rtd2g zN$OAT6!_s219LJvw9+9}KN}cG^%E=mH=G#+ld`VlCpX9ikAq?02{0Ty37!HYKpw~k zBf%)}G#CxWfU#g47!RHS6Tq`zB6to=0tH|)m;$DPX`m2H2hW2Upa{$av%qZd0w@M0 zU=ElINTWGb0K 
z_yInOYj9umBk{FX#@nw{drWL()VS%`oq&o9D@sd9_AGB761T<@$vZWkKjie9oInr6 z<#-+T*2SJhSc$L65Kt!=?vu&cHl-)oIj|NwwFZ`fINE>dmM7H?TjH64<&>HvT94si z+w{r=dvTp-6AK4>(rdyZ5`Wv(pGjgIO3J99&bH`yIQ4B?(c zHm>jtl+$X$=`e#jl69>zNlB@J!XFM_t{{==mA&hvl^);)m7vO&R{H)LRds}x3d_tk zS)&;pF=~tqgV&GgmOfY)wNu(^ZH#89=hY4B7&TP6sBBZ7SNhBUf#*C&PLl4yY_VXr zurOPeiX+5O;i9k!%0y588sEq-;0N=FYvz`5d0Ys4iG7ou$aaCUyOEj7^u^clUOX4G z4)iB26b?K6if7x357LT)yh-CCOPed4b}3GjWd6fkV~#UB8Q&T^jZ!1i!1_Vmqq}uZJFYF!^0iR)oVs2e z2g}$+Wu5Y@(oOze-Yvf<50sJQmzGFRNg-mJ_?}oHrirX@LRczf3kv^Fehu&9MeY>0 zl6#yBW-qYY*x76X^9!?|sbO*$#)QFJi&o>|SVkwHEe-cWLvR|Y&u={9cXkfJCD=YP zs&P8@t6?E{7v_?!B%#6IH3TQIL##xyYKJR?#IG(FHfY{H{)}!R_&By($2F#NVkcUR zU|6-D^g87MR!Cq^bT+rj3F{v03g{x)uyt)u+@8eeH%7za)giEWMRu-=BIfiPg6)~u zxRkNEW{b_jL-$G5BE}(gLAV;!fW@HJ4#G=r|7EHNP&=g-nM;kk#(ZP+`k=bUutFx^ zYU$|@DoDakg-dP6A+k)VyK5#H910(wS2DY(yl8GkA=#+bMTwdpDUm}5+kUmKoXLe} z(LK6kcwu>YX<=DJTts$hdG^%uqIpH-A8e0+^U6Pn;5gJJCGd^^_u2N+$wj4w7Gim_ z$f$d)UuogfB^Bj`5z|XbUnq>27F}3eR#^0Wap4Pv#pMwd#YOW9OUsJN={f!DnvFWJ z6f6VF!3wYvyaZN()!=1N57vOSU>#TwHUI)%0UN=qU=!F3UIVX#2CxNe1#f^i!8Y&~ zcpJO}-UZvi|A4=N_rTx54zLsKvYU;@tZb6_#VZL9AMhD-Lu->zCloY--Cz&c3-*Ei zzy=4vL2w8h22J1ycpvydGdK#4fe+ph3R)uYRw&vn?o@HO81isdjuArnb3Eang<1Ix zx1F2KCBUM(gPqT2GJi6Drj~Ir4E`9`<9r;9E|R>0@Gf>ItK}^$52($g`%kEwyFT|i z?WXFM4>>ux_ThdR>gMnv-dMY)uB9fl>JT!U%thuPOky}vSrpw)>AEZXK`l}Ksg9dWN%COU=R z1zVUWWbpU+Bm6u*jl0e5=H_!5?0xnSR6{4rFds5Ym`p~%$M6#TBo0C6Q9aUN4*lOB zQLnVrYROCBC5mq9Uzl7>8i$4VB$)|SqW$ufmfM<~wFEX_4yYLobN|iZ4&?rKl~Hxp zjTUem`~!RlPJol(Bk)h~F*pTIgEOENoCTkNbKq0(88{C<2Va0M!3A&;Tmt_Bmq8o& z3S0qSgKxmM;5%>?d=LH&egN0NkKiY89ozsn!7Xqb{0#mBegVIN-@xzS4)_Dy1%HBj z;6C^ZJXm4fz^efbh6YQ6qruY%7q9G-IwIOHk()2R14k)AMRs) zZ^P@(q|uELSWdPWy8owpmmfT_wmqh?7)y>tfw$B8OP)R?^X^!IOdQu(M%wZk9mKQG znj=;O_Fe<_Sjl$ueoJR0=d8dB?t^ZX@nrS^YnGHeBXI3zpEs812dz2Wz-jQ-ZXL8r zH7Cr-a2UpODyGaPSMtMpk((`(MAGk&bxlhD@9uilZY!DOAGYQ*j%Ub@<(^nr)0T?l z!C`BWQ! zF26NEa^^mIH6+1ze0?^YEECm4*l^O9`)Q?