-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocessor.py
73 lines (63 loc) · 2.32 KB
/
preprocessor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import re
import pandas as pd
def preprocess(data: str) -> pd.DataFrame:
    """Parse an exported chat transcript into a per-message DataFrame.

    A message starts on a line beginning with a header of the form
    ``d/m/yy, h:mm am - Sender: `` (12-hour clock, lowercase ``am``/``pm``).
    Any line that does not start with such a header is treated as a
    continuation of the most recent message. Lines that appear before the
    first header have no message to belong to and are ignored.

    Args:
        data: Full text of the chat export, with newline-separated lines.

    Returns:
        A DataFrame with one row per message and these columns:
        ``Date`` (datetime64), ``Time`` (the raw ``h:mm`` text), ``Period``
        (``am`` or ``pm``), ``Sender``, ``Message``, ``year``, ``month``
        (month name), ``date`` (day of month), ``time`` (``datetime.time``
        on a 24-hour clock), ``hour`` (0-23), ``min`` and ``Day_name``.

    Raises:
        ValueError: If a matched date does not fit ``%d/%m/%y`` or a matched
            time is not a valid 12-hour clock value.
    """
    # Regular expression pattern to match the date, time, am/pm and sender.
    pattern = r'(\d{1,2}/\d{1,2}/\d{2}), (\d{1,2}:\d{2})\s?[^\S\r\n]*([ap]m) - (.*?): '
    header_re = re.compile(pattern)

    # headers[i] and bodies[i] always describe the same message; appending to
    # both in the same branch is what keeps sender/date and text aligned.
    headers = []  # (date, time, period, sender) tuples
    bodies = []   # list of stripped text lines for each message

    for line in data.split('\n'):
        match = header_re.match(line)
        if match:
            headers.append(match.groups())
            bodies.append([line[match.end():].strip()])
        elif bodies:
            # Continuation of the previous (multi-line) message.
            bodies[-1].append(line.strip())
        # Otherwise: text before the first header -- nothing to attach it to.

    rows = [
        (*header, "\n".join(body).strip())
        for header, body in zip(headers, bodies)
    ]
    df = pd.DataFrame(
        rows,
        columns=['Date', 'Time', 'Period', 'Sender', 'Message'],
    )

    df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%y')
    df['year'] = df['Date'].dt.year
    df['month'] = df['Date'].dt.month_name()
    df['date'] = df['Date'].dt.day

    # Combine the clock text with its am/pm marker so that afternoon and
    # midnight times land on the correct 24-hour value.
    clock = pd.to_datetime(df['Time'] + ' ' + df['Period'], format='%I:%M %p')
    df['hour'] = clock.dt.hour
    df['min'] = clock.dt.minute
    df['time'] = clock.dt.time

    df['Day_name'] = df['Date'].dt.day_name()
    return df