forked from martinalex7/ProjetPython2A
-
Notifications
You must be signed in to change notification settings - Fork 1
/
functions.py
130 lines (108 loc) · 3.58 KB
/
functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import re
import numpy as np
import nltk
from nltk import word_tokenize
from lyricsgenius import Genius
import collections
from nltk.stem import WordNetLemmatizer
# One-time setup, left disabled here: nltk.download('omw-1.4')
# Module-level lemmatizer instance, used by lyrics_cleaning below.
WNL = WordNetLemmatizer()
def lyrics_for_df(id_):
    """Look up the lyrics of one song through ``Genius.lyrics``.

    Args:
        id_: Song identifier, forwarded as the ``song_id`` keyword argument.

    Returns:
        Whatever ``Genius.lyrics`` returns for that song id.
    """
    # NOTE(review): ``lyrics`` is called on the ``Genius`` class itself, not
    # on a client instance -- confirm this works with the installed
    # lyricsgenius version (an authenticated instance may be required).
    song_lyrics = Genius.lyrics(song_id=id_)
    return song_lyrics
#
def intro_detection(lyrics):
    """Tell whether the lyrics contain a section tag mentioning 'Intro'.

    A section tag is a non-greedy ``[...]`` span that sits on a single line
    (``.`` does not match a newline), for example ``[Intro: Artist]``.

    Args:
        lyrics: Lyrics text. It is passed through ``str()`` first, so
            non-string values are scanned via their string form.

    Returns:
        int: 1 if at least one tag contains the substring 'Intro', else 0.
    """
    # Raw string: in a plain literal '\[' is an invalid escape sequence and
    # triggers a warning on current Python versions.
    tags = (match.group(0) for match in re.finditer(r'\[(.*?)\]', str(lyrics)))
    return int(any('Intro' in tag for tag in tags))
def outro_detection(lyrics):
    """Tell whether the lyrics contain a section tag mentioning 'Outro'.

    A section tag is a non-greedy ``[...]`` span that sits on a single line
    (``.`` does not match a newline), for example ``[Outro]``.

    Args:
        lyrics: Lyrics text. It is passed through ``str()`` first, so
            non-string values are scanned via their string form.

    Returns:
        int: 1 if at least one tag contains the substring 'Outro', else 0.
    """
    # Raw string: in a plain literal '\[' is an invalid escape sequence and
    # triggers a warning on current Python versions.
    tags = (match.group(0) for match in re.finditer(r'\[(.*?)\]', str(lyrics)))
    return int(any('Outro' in tag for tag in tags))
def nbr_verses(lyrics):
    """Count the section tags that mention 'Verse'.

    A section tag is a non-greedy ``[...]`` span that sits on a single line,
    for example ``[Verse 2: Artist]``.

    Args:
        lyrics: Lyrics text; converted with ``str()`` before scanning.

    Returns:
        int: Number of tags containing the substring 'Verse'.
    """
    # Raw string: in a plain literal '\[' is an invalid escape sequence and
    # triggers a warning on current Python versions.
    tags = (match.group(0) for match in re.finditer(r'\[(.*?)\]', str(lyrics)))
    return sum(1 for tag in tags if 'Verse' in tag)
def nbr_chorus(lyrics):
    """Count chorus-like section tags ('Chorus' or 'Refrain').

    A section tag is a non-greedy ``[...]`` span that sits on a single line.
    For every tag, two independent checks are made:

    * it contains 'Chorus' but not 'Pre-Chorus'  -> counts once;
    * it contains 'Refrain'                      -> counts once.

    A tag satisfying both checks is therefore counted twice, and any other
    tag that merely contains 'Chorus' (e.g. 'Post-Chorus') is counted.

    Args:
        lyrics: Lyrics text; converted with ``str()`` before scanning.

    Returns:
        int: The resulting count.
    """
    # Raw string: in a plain literal '\[' is an invalid escape sequence and
    # triggers a warning on current Python versions.
    tags = (match.group(0) for match in re.finditer(r'\[(.*?)\]', str(lyrics)))
    total = 0
    for tag in tags:
        if 'Chorus' in tag and 'Pre-Chorus' not in tag:
            total += 1
        if 'Refrain' in tag:
            total += 1
    return total
def nbr_parts(lyrics):
    """Count the section tags that mention 'Part'.

    A section tag is a non-greedy ``[...]`` span that sits on a single line,
    for example ``[Part 1]``.

    Args:
        lyrics: Lyrics text; converted with ``str()`` before scanning.

    Returns:
        int: Number of tags containing the substring 'Part'.
    """
    # Raw string: in a plain literal '\[' is an invalid escape sequence and
    # triggers a warning on current Python versions.
    tags = (match.group(0) for match in re.finditer(r'\[(.*?)\]', str(lyrics)))
    return sum(1 for tag in tags if 'Part' in tag)
def nbr_interlude(lyrics):
    """Count the section tags that mention 'Interlude'.

    A section tag is a non-greedy ``[...]`` span that sits on a single line,
    for example ``[Interlude]``.

    Args:
        lyrics: Lyrics text; converted with ``str()`` before scanning.

    Returns:
        int: Number of tags containing the substring 'Interlude'.
    """
    # Raw string: in a plain literal '\[' is an invalid escape sequence and
    # triggers a warning on current Python versions.
    tags = (match.group(0) for match in re.finditer(r'\[(.*?)\]', str(lyrics)))
    return sum(1 for tag in tags if 'Interlude' in tag)
def nbr_bridge(lyrics):
    """Count the section tags that mention 'Bridge'.

    A section tag is a non-greedy ``[...]`` span that sits on a single line,
    for example ``[Bridge]``.

    Args:
        lyrics: Lyrics text; converted with ``str()`` before scanning.

    Returns:
        int: Number of tags containing the substring 'Bridge'.
    """
    # Raw string: in a plain literal '\[' is an invalid escape sequence and
    # triggers a warning on current Python versions.
    tags = (match.group(0) for match in re.finditer(r'\[(.*?)\]', str(lyrics)))
    return sum(1 for tag in tags if 'Bridge' in tag)
def nbr_pre_chorus(lyrics):
    """Count the section tags that mention 'Pre-Chorus'.

    A section tag is a non-greedy ``[...]`` span that sits on a single line,
    for example ``[Pre-Chorus]``.

    Args:
        lyrics: Lyrics text; converted with ``str()`` before scanning.

    Returns:
        int: Number of tags containing the substring 'Pre-Chorus'.
    """
    # Raw string: in a plain literal '\[' is an invalid escape sequence and
    # triggers a warning on current Python versions.
    tags = (match.group(0) for match in re.finditer(r'\[(.*?)\]', str(lyrics)))
    return sum(1 for tag in tags if 'Pre-Chorus' in tag)
def lyrics_cleaning(lyrics):
    """Turn raw lyrics into a space-separated string of lowercase lemmas.

    Steps, in order:

    1. Replace every single-line ``[...]`` section tag with a space.
    2. Replace newlines and apostrophes with spaces.
    3. Strip the trailing page residue ``<digits>EmbedShare`` and
       ``URLCopyEmbedCopy``.
    4. Tokenize with ``word_tokenize``, keep purely alphabetic tokens,
       lowercase them, lemmatize each one both with the lemmatizer's default
       part of speech and as a verb, and keep the shorter of the two results
       (see ``smallest_lemma``).

    Args:
        lyrics: Raw lyrics; converted with ``str()`` before processing.

    Returns:
        str: The retained lemmas joined by single spaces.
    """
    text = str(lyrics)
    # Raw string: in a plain literal '\[' is an invalid escape sequence and
    # triggers a warning on current Python versions.
    text = re.sub(r'\[(.*?)\]', ' ', text)
    text = text.replace('\n', ' ').replace('\'', ' ')
    # The original removed only the literal '25EmbedShare'; any other number
    # glued to the last word made that whole token non-alphabetic, so the
    # word itself was silently dropped.
    # NOTE(review): assumes the counter before 'EmbedShare' varies per page.
    text = re.sub(r'\d*EmbedShare', ' ', text)
    text = text.replace('URLCopyEmbedCopy', ' ')

    lemmas = []
    for token in word_tokenize(text):
        if not token.isalpha():
            continue
        lowered = token.lower()
        lemmas.append(
            smallest_lemma(WNL.lemmatize(lowered),
                           WNL.lemmatize(lowered, pos='v'))
        )
    return ' '.join(lemmas)
def tokenized_lyrics(lyrics):
    """Split lyrics into tokens with ``word_tokenize``.

    Args:
        lyrics: Lyrics text; converted with ``str()`` before tokenizing.

    Returns:
        The token sequence produced by ``word_tokenize``.
    """
    text = str(lyrics)
    return word_tokenize(text)
def smallest_lemma(word1, word2):
    """Return the shorter of the two words; ``word2`` wins when lengths tie.

    Args:
        word1: First candidate.
        word2: Second candidate.

    Returns:
        ``word1`` only if it is strictly shorter than ``word2``; otherwise
        ``word2``.
    """
    return word1 if len(word1) < len(word2) else word2
def release_date(date):
    """Extract the ``'year'`` entry from a date mapping.

    Args:
        date: Object supporting ``date['year']``.

    Returns:
        The value stored under the ``'year'`` key.
    """
    year = date['year']
    return year
def dict_freq_words(tok_lyrics):
    """Build a token -> count dict ordered from most to least frequent.

    Tokens with equal counts keep the order in which they were first seen.

    Args:
        tok_lyrics: Iterable of hashable tokens.

    Returns:
        dict: Counts keyed by token, in descending order of count.
    """
    frequencies = collections.Counter(tok_lyrics)
    return dict(frequencies.most_common())
def featuring(vec):
    """Flag whether the first two entries of ``vec`` differ.

    Args:
        vec: Indexable with at least two elements.

    Returns:
        int: 0 when ``vec[0] == vec[1]``, otherwise 1.
    """
    first, second = vec[0], vec[1]
    return 0 if first == second else 1
def len_song(tok_lyrics):
    """Return the number of items in the tokenized lyrics.

    Args:
        tok_lyrics: Any sized collection of tokens.

    Returns:
        int: ``len(tok_lyrics)``.
    """
    token_count = len(tok_lyrics)
    return token_count