do.py
import pandas as pd
import numpy as np
from scipy.optimize import minimize
from dowhy import CausalModel
from tqdm import tqdm
from pyspark.sql import SparkSession
from pyspark import SparkConf
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
student_nm = 'user_id'
knowledge_id = 'exercise'
response_id = 'correct'
difficulty_id = 'difficulty'
probability_id = 'probability_id'
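# Expected input shape (a sketch based on the column constants above; the example values
# are an assumption for illustration): one long-format row per (user_id, exercise) attempt.
#
#     user_id  exercise  correct  difficulty
#     s1       KC_1      1        0.3
#     s1       KC_2      0        0.7
#     s2       KC_1      1        0.3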
def count_calls(func):
    """Decorator that counts and reports how many times the wrapped function has been called."""
    count = 0

    def wrapper(*args, **kwargs):
        nonlocal count
        count += 1
        print(f"`likelihood_function` has been called {count} times")
        return func(*args, **kwargs)

    return wrapper
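# Usage sketch (an assumption: the decorator is defined but not applied anywhere in this
# file; it appears intended for the inner likelihood function below):
#
#     @count_calls
#     def likelihood_function(...):
#         ...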
def IRT_theta(do_data):
    """Estimate each student's ability (theta) with a 1PL IRT model fit by gradient descent."""
    global student_nm, knowledge_id, response_id, difficulty_id, probability_id
    data = do_data.copy()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    num_items = len(data[knowledge_id].unique())  # number of distinct items (currently unused)
    num_students = len(data[student_nm].unique())
    initial_ability = torch.zeros(num_students, requires_grad=True, device=device)
    # Keep the original student ids so theta can be keyed by them after recoding.
    student_ids = data[student_nm].unique().tolist()
    # Recode ids to contiguous integer codes for tensor indexing.
    data[knowledge_id] = pd.Categorical(data[knowledge_id], categories=data[knowledge_id].unique()).codes.astype(
        np.int64)
    data[student_nm] = pd.Categorical(data[student_nm], categories=data[student_nm].unique()).codes.astype(np.int64)
    data[response_id] = data[response_id].astype(np.float32)
    data[difficulty_id] = data[difficulty_id].astype(np.float32)
    knowledge_tensor = torch.tensor(data[knowledge_id].values, dtype=torch.int64).to(device)
    student_tensor = torch.tensor(data[student_nm].values, dtype=torch.int64).to(device)
    response_tensor = torch.tensor(data[response_id].values, dtype=torch.float32).to(device)
    difficulty_tensor = torch.tensor(data[difficulty_id].values, dtype=torch.float32).to(device)
    D = 1.702  # scaling constant that makes the logistic curve approximate the normal ogive

    def likelihood_function(ability, knowledge, student, response, difficulty):
        # Negative log-likelihood of the 1PL model: P = sigmoid(D * (theta - b)).
        theta_minus_b = ability[student] - difficulty
        probability = 1 / (1 + torch.exp(-D * theta_minus_b))
        log_likelihood = response * torch.log(probability) + (1 - response) * torch.log(1 - probability)
        return -log_likelihood.sum()

    dataset = TensorDataset(knowledge_tensor, student_tensor, response_tensor, difficulty_tensor)
    dataloader = DataLoader(dataset, batch_size=1024, shuffle=True)
    optimizer = torch.optim.Adam([initial_ability], lr=0.001)
    for epoch in tqdm(range(10)):
        for batch_knowledge, batch_student, batch_response, batch_difficulty in dataloader:
            optimizer.zero_grad()
            loss = likelihood_function(initial_ability, batch_knowledge, batch_student, batch_response,
                                       batch_difficulty)
            loss.backward()
            optimizer.step()
    # Detach from the autograd graph; no forced .cuda() call, so this also runs on CPU-only machines.
    final_ability = initial_ability.detach()
    theta = {student_id: final_ability[i].item() for i, student_id in
             tqdm(enumerate(student_ids), total=len(student_ids), desc="Processing")}
    return theta
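# Usage sketch (toy data and the name `logs` are assumptions for illustration only):
#
#     logs = pd.DataFrame({
#         'user_id':    ['s1', 's1', 's2', 's2'],
#         'exercise':   ['KC_1', 'KC_2', 'KC_1', 'KC_2'],
#         'correct':    [1, 0, 1, 1],
#         'difficulty': [0.3, 0.7, 0.3, 0.7],
#     })
#     theta = IRT_theta(logs)   # e.g. {'s1': -0.02, 's2': 0.05}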
def IRT_P(data, theta):
    """Compute each student's predicted probability of answering each item correctly."""
    global student_nm, knowledge_id, response_id, difficulty_id, probability_id
    data_grouped = data.groupby([student_nm, knowledge_id], as_index=False)
    num_rows = sum(len(data_grouped.get_group(group)) for group in data_grouped.groups)
    col_names = ['student_nm', 'knowledge_id', 'probability_id']
    result = pd.DataFrame(index=np.arange(num_rows), columns=col_names)
    index = 0
    D = 1.702
    for group, subset in tqdm(data_grouped, total=len(data_grouped), desc="Students"):
        student, knowledge = group
        difficulty = subset[difficulty_id].values
        theta_student = theta[student]
        # 1PL response probability: P = sigmoid(D * (theta - b)).
        probability = 1 / (1 + np.exp(-D * (theta_student - difficulty)))
        len_prob = len(probability)
        result.iloc[index:index + len_prob] = np.column_stack(
            ([student] * len_prob, [knowledge] * len_prob, probability)
        )
        index += len_prob
    return result
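# Usage sketch (continuing the toy `logs` example above, an assumption for illustration):
#
#     probs = IRT_P(logs, theta)
#     # probs has columns 'student_nm', 'knowledge_id', 'probability_id',
#     # with one row per original (student, exercise) log entry.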
def do(data, variable, parents):
    """Estimate the causal effect of `parents` on `variable` with DoWhy's backdoor adjustment."""
    global student_nm, knowledge_id, response_id, difficulty_id, probability_id
    data = data.pivot_table(index="student_nm", columns="knowledge_id", values="probability_id")
    data.fillna(0, inplace=True)
    # data is now a per-student probability-of-correct matrix, one column per knowledge component:
    #              KC_1  KC_2  KC_3  KC_4
    #  student_1    0.9   0.8   0.7   0.6
    #  student_2    0.3   0.4   0.1   0.2
    #  student_3    0.5   0.6   0.9   0.8
    model = CausalModel(
        data=data,
        treatment=parents,
        outcome=variable,
        common_causes=[element for element in data.columns if element not in [parents, variable]]
    )
    identified_estimand = model.identify_effect()
    # Propensity-score estimators (discrete treatment): propensity score weighting or matching.
    # estimate = model.estimate_effect(identified_estimand, method_name="backdoor.propensity_score_weighting")
    # estimate = model.estimate_effect(identified_estimand, method_name="backdoor.propensity_score_matching")
    # Linear regression (continuous treatment).
    estimate = model.estimate_effect(identified_estimand, method_name="backdoor.linear_regression")
    # Generalized Linear Model (GLM), for modelling non-normal outcome variables.
    # from statsmodels.api import families
    # method_params = {
    #     "glm_family": families.Binomial()
    # }
    # estimate = model.estimate_effect(identified_estimand, method_name="backdoor.generalized_linear_model", method_params=method_params)
    # Regression discontinuity.
    # estimate = model.estimate_effect(identified_estimand, method_name="iv.regression_discontinuity",
    #                                  method_params={'rd_variable_name': 'response',
    #                                                 'rd_threshold_value': 0.5,
    #                                                 'rd_bandwidth': 0.15})
    # Instrumental variables.
    # identified_estimand = model.identify_effect(proceed_when_unidentifiable=True)
    # estimate = model.estimate_effect(identified_estimand,
    #                                  method_name="iv.instrumental_variable",
    #                                  test_significance=True)
    return estimate.value
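# End-to-end usage sketch (the `logs` DataFrame and the specific KC names are assumptions
# for illustration; only the function names come from this file):
#
#     theta = IRT_theta(logs)                        # per-student ability estimates
#     probs = IRT_P(logs, theta)                     # per-response correctness probabilities
#     effect = do(probs, variable='KC_2', parents='KC_1')
#     print(f"Estimated causal effect of KC_1 on KC_2: {effect}")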