-
Notifications
You must be signed in to change notification settings - Fork 1
/
MinimalPairsAnalysis.py
147 lines (110 loc) · 4.94 KB
/
MinimalPairsAnalysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
"""
This script reads a csv with English phoneme confusability data and a csv with
distinctive features for English phonemes, and then scatterplots and correlates
the confusability of a contrast with the similarity of that contrast in
distinctive features.
NOTA BENE: For this quick-and-dirty analysis, I apparently just found some
Australian English features! The confusability data are taken from Graff
(2012), who in turn took them from Miller and Nicely (1955).
If this analysis is ever taken somewhere serious, distinctive features for
American English should obviously be used instead!
It does appear that confusability is highly correlated with number of features
in common! I think Pearson R is okay for now -- scatterplot appears to be
roughly bivariate normally distributed, both X and Y are unbounded, etc.
So next step -- modify Graff/Martin's model with an additional layer for
distinctive features!
"""
import os, itertools
import numpy as np
import pandas as pd
from numpy import genfromtxt
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
# Ask for the pandas plotting style.  Older pandas releases exposed it through
# the ``display.mpl_style`` option; later releases removed the option, and
# assigning to it then raises pandas' OptionError (which derives from both
# AttributeError and KeyError).  In that case fall back to matplotlib's own
# style mechanism so the script keeps running.
try:
    pd.options.display.mpl_style = 'default'
except (AttributeError, KeyError):
    # NOTE(review): 'ggplot' is the replacement the pandas docs suggested for
    # the old option -- confirm the resulting look is acceptable.
    plt.style.use('ggplot')

# NOTE(review): the CSV files below are opened relative to this hard-coded,
# machine-specific directory; change it before running on another machine.
os.chdir('/Users/russellrichie/Richie extension of Graff (2012)/Richie-extension-of-Graff-2012')
"""
Read in the confusability data and build a dict keyed by unordered phoneme pair, e.g., confDataDict[frozenset(('b','d'))] = '3.4' (values are still strings at this point)
"""
# Load the whole confusability table as strings.  Row 0 supplies the column
# labels (after its first cell); the loop below reads data rows from index 2
# onwards, each of which starts with its own row label.
confData = genfromtxt('confusability data.csv', delimiter=',', dtype=str)
columnLetters = confData[0][1:]
confDataDict = {}
for row in confData[2:]:
    rowLetter = row[0]
    # Key every cell by the unordered {column label, row label} pair; a pair
    # that occurs in both orders keeps whichever cell is visited last.
    for colind, letter in enumerate(columnLetters, start=1):
        confDataDict[frozenset((letter, rowLetter))] = row[colind]
"""
Read in the distinctive features data and build a dict giving, for each unordered phoneme pair, the number of features on which the two differ, e.g., distFeatDataDict[frozenset(('b','d'))] = 1
"""
# Load the feature table as strings and drop row 0, which only holds the
# column labels.  In every remaining row the first cell is the row's label
# and the rest are its feature values.
featureTable = genfromtxt('english distinctive features.csv', delimiter=',', dtype=str)
distFeatData = featureTable[1:]
distFeatDataDict = {}
# Visit every unordered pair of rows and count the feature columns in which
# the two rows hold different values.
for firstRow, secondRow in itertools.combinations(distFeatData, 2):
    mismatches = [1 for x, y in zip(firstRow[1:], secondRow[1:]) if x != y]
    featDifference = len(mismatches)
    distFeatDataDict[frozenset((firstRow[0], secondRow[0]))] = featDifference
"""
Put the dist feat data and the confusability data together...
"""
# Join the two tables on the phoneme pair.  Each entry of allData is
# (pair, confusability score as float, feature distance).
allData = []
for confKey, confValue in confDataDict.items():
    try:
        entry = (confKey, float(confValue), distFeatDataDict[confKey])
    except (KeyError, ValueError):
        # ValueError: the confusability cell is not a number.
        # KeyError: the pair has no entry in the feature-distance dict.
        # Such pairs are skipped on purpose; any other error should surface
        # instead of being silently swallowed.
        continue
    allData.append(entry)
"""
...then unzip it so that the labels, confusability scores, and feature distances
are each in their own sequence again (in matching order!), ready for plotting and correlation.
"""
# Transpose allData from a list of (label, score, distance) triples into three
# parallel columns.  On Python 3 zip() returns a one-shot iterator that cannot
# be indexed, so it is materialised into a list first (on Python 2 this is
# just a copy of the list zip() already returned).
listedData = list(zip(*allData))
labels = listedData[0]
confDataClean = np.array(listedData[1])
distFeatDataClean = np.array(listedData[2])
# Degree-1 least-squares fit of feature distance (y) against confusability
# score (x): m is the slope, b the intercept.
m, b = np.polyfit(confDataClean, distFeatDataClean, 1)
fig = plt.figure()
ax = plt.axes()
# Every plotted marker is stored next to its annotation as a [marker,
# annotation] pair so both can be looked up together later.
points_with_annotation = []
for pair, x, y in zip(labels, confDataClean, distFeatDataClean):
    # plt.plot returns a list of lines; a single (x, y) yields one marker.
    point = plt.plot(x, y, 'bo-', markersize=10, markerfacecolor=(1, 1, 0, 0.5))[0]
    label = list(pair)
    arrow_style = {"arrowstyle": "simple", "connectionstyle": "arc3,rad=-0.2"}
    box_style = {"boxstyle": "round", "facecolor": "w", "edgecolor": "0.5", "alpha": 0.9}
    # The text box sits one x-unit to the right of the point it describes.
    annotation = ax.annotate(
        str(label),
        xy=(x, y),
        xycoords='data',
        xytext=(x + 1, y),
        textcoords='data',
        horizontalalignment="left",
        fontsize=15,
        arrowprops=arrow_style,
        bbox=box_style,
    )
    # Annotations start out hidden.
    annotation.set_visible(False)
    points_with_annotation.append([point, annotation])
def on_move(event):
    """Show the annotation of any point the event falls on; hide the others.

    Walks every [point, annotation] pair in ``points_with_annotation``, asks
    the point whether it contains ``event``, and flips the annotation's
    visibility when it disagrees with that answer.  The figure is redrawn
    once at the end, and only if at least one annotation changed.

    event: the event object handed to ``point.contains``.
    """
    needs_redraw = False
    for marker, note in points_with_annotation:
        # contains() returns a tuple whose first item says whether the event hit.
        hovered = bool(marker.contains(event)[0])
        if hovered == note.get_visible():
            continue
        note.set_visible(hovered)
        needs_redraw = True
    if needs_redraw:
        plt.draw()
# Run on_move on every mouse-motion event delivered to the figure's canvas.
on_move_id = fig.canvas.mpl_connect('motion_notify_event', on_move)

# Overlay the fitted line m * x + b on the plotted points.
plt.plot(confDataClean, m * confDataClean + b, '-')

plt.suptitle('More discriminable contrasts differ on more features', fontsize=30)
plt.xlabel('M&N Discriminability score', fontsize=24)
plt.ylabel('Distance in number of distinctive features', fontsize=24)
plt.xticks(np.arange(.5, 5.5, .5), fontsize=20)
plt.yticks(np.arange(0, 12, 2), fontsize=20)

# Compute the Pearson correlation once and reuse both parts of the result
# (index 0: correlation coefficient, index 1: p-value) in the on-plot label.
correlation = pearsonr(confDataClean, distFeatDataClean)
rText = "r = {}, p={}".format(round(correlation[0], 3), round(correlation[1], 5))
plt.text(1, 9, rText, fontsize=25)
plt.show()
"""
for label, x, y in zip(labels, confDataClean, distFeatDataClean):
label = list(label)
label.insert(1,', ')
plt.annotate(
''.join(label),
xy = (x, y), xytext = (-20, 20),
textcoords = 'offset points', ha = 'right', va = 'bottom',
bbox = dict(boxstyle = 'round,pad=0.2', fc = 'yellow', alpha = 0.5),
arrowprops = dict(arrowstyle = '->', connectionstyle = 'arc3,rad=0'))
"""
#print pearsonr(confDataClean,distFeatDataClean)