Skip to content

Commit

Permalink
upload
Browse files Browse the repository at this point in the history
  • Loading branch information
AndreiNesterov committed Aug 31, 2023
1 parent ae14c70 commit 5c6a611
Show file tree
Hide file tree
Showing 3 changed files with 276 additions and 0 deletions.
184 changes: 184 additions & 0 deletions sensitivity_markers/implicit/searching_implicit_markers.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"import json\n",
"import csv\n",
"import pandas as pd\n",
"from collections import Counter"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"def get_lemma_by_term(query_term:str, lang:str) -> str:\n",
" '''\n",
" Getting a lemma of a query term\n",
" lang: str, 'en' or 'nl'\n",
" Returns str, 'not found' if lemma was not found\n",
" '''\n",
" \n",
" return_lemma = 'not found'\n",
" \n",
" # importing query terms with lemmas\n",
" # change path to GitHub\n",
" \n",
" with open('/Users/anesterov/reps/LODlit/query_terms.json','r') as jf:\n",
" query_terms = json.load(jf)\n",
" \n",
" for lemma, qt in query_terms[lang].items():\n",
" if query_term in qt:\n",
" return_lemma = lemma\n",
" \n",
" return return_lemma"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Wikidata"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def search_implicit_markers_wd(lang:str):\n",
" '''\n",
" lang: str, \"en\" or \"nl\"\n",
" Returns pandas DataFrame\n",
" '''\n",
" \n",
" wd_implicit = pd.DataFrame(columns=[\"resource\",lang,\"lemma\",\"entity_id\",\"property\",\"value\",\"implicit_marker\",\"level\"])\n",
" \n",
" # importing implicit markers dict\n",
" with open('/Users/anesterov/reps/LODlit/sensitivity_markers/implicit/implicit_markers.json','r') as jf:\n",
" implicit_markers = json.load(jf)\n",
" \n",
" # importing all search results\n",
" with open(f\"/Users/anesterov/LODlit_local/wd/jan31/results_clean_{lang}.json\",'r') as jf:\n",
" wd_all = json.load(jf)\n",
" \n",
" # import subset\n",
" with open(f\"/Users/anesterov/reps/LODlit/Wikidata/wd_{lang}_subset.json\",'r') as jf:\n",
" wd_subset = json.load(jf)\n",
" \n",
" # get all QIDs in the subset\n",
" subset_quids = []\n",
" for hits in wd_subset.values():\n",
" for hit in hits:\n",
" subset_quids.append(hit[\"QID\"])\n",
" \n",
" # import rm\n",
" wd_rm = pd.read_csv(\"/Users/anesterov/reps/LODlit/rm/rm_entities_unique.csv\")\n",
" rm_quids = list(wd_rm[wd_rm[\"resource\"] == \"wikidata\"][wd_rm[\"lang\"] == lang][\"entity_id\"])\n",
" \n",
" # searching in descriptions\n",
" for term, hits in wd_all.items():\n",
" lemma = get_lemma_by_term(term, lang)\n",
" \n",
" for hit in hits:\n",
"\n",
" level = \"1\"\n",
" # check entity level\n",
" if hit[\"QID\"] in set(subset_quids):\n",
" level = \"2\"\n",
" if hit[\"QID\"] in set(rm_quids):\n",
" level = \"3\"\n",
"\n",
" # check descriptions type\n",
" if type(hit[\"description\"]) == list:\n",
" for d in hit[\"description\"]:\n",
" # iterating over all markers\n",
" for marker in implicit_markers[\"wikidata\"][lang]:\n",
" match = re.search(f\"\\\\b{marker}\\\\b\",d)\n",
" if match:\n",
" row = [\"wikidata\",lang,lemma,hit[\"QID\"],\"description\",d,match[0],level]\n",
" wd_implicit.loc[len(wd_implicit)] = row\n",
"\n",
" if type(hit[\"description\"]) == str:\n",
" for marker in implicit_markers[\"wikidata\"][lang]:\n",
" match = re.search(f\"\\\\b{marker}\\\\b\",hit[\"description\"])\n",
" if match:\n",
" row = [\"wikidata\",lang,lemma,hit[\"QID\"],\"description\",hit[\"description\"],match[0],level]\n",
" wd_implicit.loc[len(wd_implicit)] = row\n",
" \n",
" return wd_implicit.drop_duplicates(ignore_index=True)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"<ipython-input-3-93c9a52dc902>:29: UserWarning: Boolean Series key will be reindexed to match DataFrame index.\n",
" rm_quids = list(wd_rm[wd_rm[\"resource\"]==\"wikidata\"][wd_rm[\"lang\"]==lang][\"entity_id\"])\n"
]
}
],
"source": [
"wd_nl_implicit = search_implicit_markers_wd(\"nl\")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# export csv\n",
"wd_nl_implicit.to_csv(\"wd_nl_implicit.csv\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## AAT"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
77 changes: 77 additions & 0 deletions sensitivity_markers/implicit/wd_en_implicit.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
,resource,en,lemma,entity_id,property,value,implicit_marker,level
0,wikidata,en,immigrant,Q1532765,description,derogatory term mostly referring to Mexican illegal immigrants to the USA,derogatory term,1
1,wikidata,en,immigrant,Q8072681,description,slang term used by Italian American mobsters to refer to newer immigrant Italian mobsters,slang term,1
2,wikidata,en,barbarian,Q134313,description,person perceived to be either uncivilized or primitive based on stereotypes,stereotypes,3
3,wikidata,en,indian,Q6385618,description,derogatory term for ethnic Indians in Southeast Asia,derogatory term,1
4,wikidata,en,indian,Q12643403,description,obsolete name for a group of Ópata Indians,obsolete name,1
5,wikidata,en,servant,Q5913718,description,derogatory term Afro-American who worked as a servant in post-slavery America,derogatory term,1
6,wikidata,en,servant,Q10705205,description,older term for domestic worker staying a long time for the same employer's family,older term,1
7,wikidata,en,servant,Q56011985,description,"A humorous novel (though with some racial stereotyping) of a negro servant and his master, a planter.",stereotyping,1
8,wikidata,en,hottentot,Q1631241,description,antiquated term for the Khoisan people of southern Africa.,antiquated,3
9,wikidata,en,gypsy,Q76475,description,(denigrating) collective term for Roma and Yenish,denigrating,1
10,wikidata,en,white,Q7129609,description,outdated classification of humans,outdated classification,3
11,wikidata,en,white,Q1066801,description,pejorative,pejorative,1
12,wikidata,en,white,Q95530210,description,a common given name used as pejorative American slang term referring to a annoying white entitled woman,slang term,1
13,wikidata,en,white,Q95530210,description,a common given name used as pejorative American slang term referring to a annoying white entitled woman,pejorative,1
14,wikidata,en,white,Q1135775,description,"derogatory term for poor, rural, usually white and male, person from the Southern United States",derogatory term,1
15,wikidata,en,white,Q27556771,description,derogatory term,derogatory term,1
16,wikidata,en,white,Q7549820,description,informal term for the white solid that results from the addition of soap to hard water,informal term,1
17,wikidata,en,white,Q45991,description,slang term for white person acting like a black person,slang term,1
18,wikidata,en,white,Q6880109,description,pejorative expression previously used within the African-American community to refer to an imperious white man,pejorative,1
19,wikidata,en,caucasian,Q7129609,description,outdated classification of humans,outdated classification,3
20,wikidata,en,kaffir,Q1285250,description,"a slur used to refer to indigenous black people in South Africa; derived from the Arabic term for ""unbeliever"", i.e. pagan",slur,3
21,wikidata,en,descent,Q906267,description,"ethnic slur directed at people of Chinese descent, originated as the Japanese transliteration for word ""China""",slur,1
22,wikidata,en,mohammedan,Q2367463,description,formerly common term referring to Islam,formerly common term,3
23,wikidata,en,mongoloid,Q11410826,description,historical term used to refer to a sub-type of the Mongoloid race and classified by William W. Howells,historical term,2
24,wikidata,en,homosexual,Q14634937,description,disparaging term used by opponents of gay rights activism,disparaging term,1
25,wikidata,en,homosexual,Q5891541,description,archaic term for homosexual,archaic term,1
26,wikidata,en,homosexual,Q7831920,description,colloquial term for a casual homosexual partner,colloquial term,1
27,wikidata,en,homosexual,Q98113730,description,pejorative term for LGBT movements,pejorative,1
28,wikidata,en,native,Q2023990,description,obsolete term for a physical type most common among populations native to the Iranian plateau,obsolete term,1
29,wikidata,en,native,Q17144151,description,obsolete term for mixed Native American and European ancestry,obsolete term,3
30,wikidata,en,negro,Q1130557,description,historical word for black people,historical word,3
31,wikidata,en,negro,Q114217628,description,historical term in the Afroamerican emancipation,historical term,1
32,wikidata,en,negro,Q5913718,description,derogatory term Afro-American who worked as a servant in post-slavery America,derogatory term,1
33,wikidata,en,negro,Q56011985,description,"A humorous novel (though with some racial stereotyping) of a negro servant and his master, a planter.",stereotyping,1
34,wikidata,en,berber,Q410453,description,"A historical term for all or most of sub-Saharan Africa or Black Africa, inclusive of much more than modern Ethiopia but almost never Egypt, Tunisia, or the land of the Berbers; in Greco-Roman mythology, considered devout and close to the gods",historical term,2
35,wikidata,en,mulatto,Q4689037,description,French pejorative term referring to a mulatto,pejorative,1
36,wikidata,en,mulatto,Q7832650,description,stereotypical fictional character in 19th and 20th century American literature,stereotypical,2
37,wikidata,en,aboriginal,Q16211163,description,term of self-reference for many Australian aboriginals,term of self-reference,1
38,wikidata,en,aboriginal,Q442425,description,self identifying term used by number of Aboriginal groups in Austealia,self identifying term,1
39,wikidata,en,aboriginal,Q5940484,description,"slang term meaning ""begging"" in some Australian Aboriginal communities",slang term,1
40,wikidata,en,gay,Q14634937,description,disparaging term used by opponents of gay rights activism,disparaging term,1
41,wikidata,en,gay,Q5528676,description,outdated medical term,outdated medical term,1
42,wikidata,en,gay,Q898608,description,"pejorative term for the alleged disproportional behind-the-scene influence of gay rights groups and the LGBT community in politics, media, culture, and everyday life",pejorative,1
43,wikidata,en,gay,Q26791,description,a gay slang term for a woman who associates either mostly or exclusively with gay men,slang term,1
44,wikidata,en,gay,Q3624420,description,slang term for overweight gay man,slang term,1
45,wikidata,en,gay,Q5207989,description,slang term in gay culture meaning an older man sexually involved in a relationship or having a sexual interest in a younger man,slang term,2
46,wikidata,en,gay,Q7845166,description,gay slang term for men who cruise for sex or potential sex partners or experiences,slang term,1
47,wikidata,en,gay,Q2919166,description,gay slang term for a gay man,slang term,1
48,wikidata,en,gay,Q1744361,description,gay slang term,slang term,1
49,wikidata,en,gay,Q55657493,description,"derogatory term ridiculing Europe's tolerance toward gay people, implying that all Europe is populated by gay people",derogatory term,1
50,wikidata,en,black,Q817393,description,"a racialized classification of people, usually a political and skin color-based category for specific populations with a mid to dark brown complexion",racialized classification,3
51,wikidata,en,black,Q1130557,description,historical word for black people,historical word,3
52,wikidata,en,black,Q1455718,description,slur against black people,slur,1
53,wikidata,en,black,Q410453,description,"A historical term for all or most of sub-Saharan Africa or Black Africa, inclusive of much more than modern Ethiopia but almost never Egypt, Tunisia, or the land of the Berbers; in Greco-Roman mythology, considered devout and close to the gods",historical term,2
54,wikidata,en,black,Q1285250,description,"a slur used to refer to indigenous black people in South Africa; derived from the Arabic term for ""unbeliever"", i.e. pagan",slur,3
55,wikidata,en,black,Q4117010,description,colloquial term derived from the ethnic slur against black people,slur,1
56,wikidata,en,black,Q4117010,description,colloquial term derived from the ethnic slur against black people,colloquial term,1
57,wikidata,en,black,Q2265295,description,genre of music that presented a stereotype of black people,stereotype,1
58,wikidata,en,black,Q45991,description,slang term for white person acting like a black person,slang term,1
59,wikidata,en,queer,Q115870510,description,identity term of a LGBT+ person,identity term,2
60,wikidata,en,ethnic groups,Q1371427,description,derogatory term for an ethnic group,derogatory term,1
61,wikidata,en,ethnic groups,Q62928,description,obsolete term for an ethnic group in the Middle East,obsolete term,1
62,wikidata,en,ethnic groups,Q5158607,description,slang term for ethnic group,slang term,1
63,wikidata,en,half-breed,Q17144151,description,obsolete term for mixed Native American and European ancestry,obsolete term,3
64,wikidata,en,race,Q62928,description,obsolete term for an ethnic group in the Middle East,obsolete term,1
65,wikidata,en,race,Q2090033,description,pejorative term referring to interethnic relationships resulting in multiethnic offspring,pejorative,1
66,wikidata,en,primitive,Q134313,description,person perceived to be either uncivilized or primitive based on stereotypes,stereotypes,3
67,wikidata,en,retarded,Q18394147,description,pejorative slur for one with a mental disability,slur,3
68,wikidata,en,retarded,Q18394147,description,pejorative slur for one with a mental disability,pejorative,3
69,wikidata,en,slave,Q4666486,description,"Arabic pejorative meaning ""slave""",pejorative,1
70,wikidata,en,ethnicity,Q2920284,description,derogatory term for an Israeli man of Mizrahi ethnicity,derogatory term,1
71,wikidata,en,ethnicity,Q5101084,description,English-language ethnic slur usually referring to a person of Chinese or East Asian ethnicity,slur,1
72,wikidata,en,indigenous,Q1285250,description,"a slur used to refer to indigenous black people in South Africa; derived from the Arabic term for ""unbeliever"", i.e. pagan",slur,3
73,wikidata,en,indigenous,Q5140747,description,pejorative term referring to indigenous peoples of the United States and Canada,pejorative,1
74,wikidata,en,indigenous,Q16842904,description,informal term used in Australian English to refer to Indigenous Australians,informal term,1
75,wikidata,en,tribe,Q7855217,description,"historical term for ""military division, clan, or tribe"" among the Turkic peoples",historical term,1
15 changes: 15 additions & 0 deletions sensitivity_markers/implicit/wd_nl_implicit.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
,resource,nl,lemma,entity_id,property,value,implicit_marker,level
0,wikidata,nl,lilliputter,Q2419953,description,informele term voor een mens met dwerggroei,informele term,3
1,wikidata,nl,homo,Q1975772,description,denigrerende term voor homo's,denigrerende,1
2,wikidata,nl,homo,Q55657493,description,"denigrerende term die de tolerantie van Europa ten opzichte van homo's belachelijk maakt, wat impliceert dat heel Europa door homo's wordt bevolkt",denigrerende,1
3,wikidata,nl,westers,Q30674738,description,denigrerend Chinees neologisme en politieke bijnaam voor westerse linkse ideologieën,denigrerend,1
4,wikidata,nl,neger,Q1130557,description,historische term voor zwart persoon,historische term,3
5,wikidata,nl,inuk,Q189975,description,naam waarmee de Eskimo's van Groenland en Canada zichzelf aanduiden,zichzelf aanduiden,3
6,wikidata,nl,blank,Q698752,description,racistische ideologie,racistische,2
7,wikidata,nl,blank,Q107405878,description,betreft de discussie of het gebruik van de term 'blank' racistisch is,racistisch,2
8,wikidata,nl,zigeuner,Q76475,description,"als denigrerend beschouwde naam voor de Roma/Jenische, soms ook spreekwoordelijk gebruikt",denigrerend,3
9,wikidata,nl,zwart,Q1130557,description,historische term voor zwart persoon,historische term,3
10,wikidata,nl,dwerg,Q2419953,description,informele term voor een mens met dwerggroei,informele term,3
11,wikidata,nl,ras,Q1130557,description,historische term voor zwart persoon,historische term,3
12,wikidata,nl,eskimo,Q189975,description,naam waarmee de Eskimo's van Groenland en Canada zichzelf aanduiden,zichzelf aanduiden,3
13,wikidata,nl,kaffer,Q1285250,description,"betekent ongelovige, ook wel voor sukkel of lomperik, en een scheldwoord",scheldwoord,3

0 comments on commit 5c6a611

Please sign in to comment.