ModelProduction.py
# -*- coding: utf-8 -*-
"""TER globale.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/15lPys0g207RL6O9F3n1_U2yhQzBToznj
## Installing and importing the libraries
"""
!pip install twarc
!pip install wget
# Install LDA Libraries
!pip install textblob
!pip install tweepy
!pip install pycountry
!pip install wordcloud
!pip install langdetect
!pip install pyLDAvis
#temp
#!pip install networkx==1.11
import sys
import os
import requests
import re
import tweepy
from geopy.geocoders import Nominatim
import json
geolocator = Nominatim(user_agent="https://colab.research.google.com/drive/15lPys0g207RL6O9F3n1_U2yhQzBToznj?usp=sharing")
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import seaborn as sns
import plotly.express as px
from textblob import TextBlob
import pycountry
import string
import folium
from wordcloud import WordCloud, STOPWORDS
from PIL import Image
from langdetect import detect
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.probability import FreqDist
import gensim
from gensim import corpora, models
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import pickle
import pandas as pd
import gzip
import shutil
import wget
import csv
import linecache
from shutil import copyfile
import ipywidgets as widgets
import numpy as np
from datetime import datetime
from datetime import timedelta
# Load the Drive helper and mount
from google.colab import drive
drive.mount('/content/drive')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
!twarc configure --consumer_key="sCEQKMT4PE6MC3NiorTZpPBMM" --consumer_secret="s90Ew2cJb32oFyF4EIMdxfotjBQckbm6OkE6hKpMpbKKwCRlAR" --access_token="1358769056766164996-qtIxIlGMiLeOFi2K6MuKxh5MgnI3sc" --access_token_secret="cSSZsEzzbBAAQxGmprgCAWR73OC2U4lJl5Cr9vlbMe9mH"
"""## Extraction des données"""
tweetsDf = pd.DataFrame()
#Add 1 day
date = datetime.strptime("2020-03-20", "%Y-%m-%d") + timedelta(days=1)
for i in range(374):
    date = date + timedelta(days=1)
    #Downloads the dataset (compressed in a GZ format)
    #!wget dataset_URL -O clean-dataset.tsv.gz
    dateStr = str(date.year) + ('-0' + str(date.month) if date.month < 10 else '-' + str(date.month)) + ('-0' + str(date.day) if date.day < 10 else '-' + str(date.day))
    url = "https://github.com/thepanacealab/covid19_twitter/blob/master/dailies/" + dateStr + "/" + dateStr + "_clean-dataset.tsv.gz?raw=true"
    print(url)
    wget.download(url, out='clean-dataset.tsv.gz')
    #Unzips the dataset and gets the TSV dataset
    with gzip.open('clean-dataset.tsv.gz', 'rb') as f_in:
        with open('clean-dataset.tsv', 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    #Deletes the compressed GZ file
    os.unlink("clean-dataset.tsv.gz")
    #Reads the daily TSV and keeps a random sample of 2000 tweets
    df = pd.read_csv('clean-dataset.tsv', sep="\t")
    #df = df.sort_values(by='favorite_count', ascending=False)
    tweetsDf = pd.concat([tweetsDf, df.sample(2000)])
print(tweetsDf.head())
!pip install pandas==1.1.5
tweetsDf[["tweet_id"]].to_csv("tweets.csv", index=False, header=False)
pd.read_csv("tweets.csv").shape
! twarc hydrate tweets.csv > "/content/drive/My Drive/Data/covid-tweets.jsonl"
tweets = pd.read_json("/content/drive/My Drive/Data/covid-tweets.jsonl", lines=True, chunksize=20000)
f = open("/content/drive/My Drive/Data/covid-tweets-en.jsonl", 'w')
f.close()
for chunk in tweets:
    out = chunk[chunk.lang == "en"].to_json(orient='records', lines=True)
    with open("/content/drive/My Drive/Data/covid-tweets-en.jsonl", 'a') as f:
        f.write(out)
        f.write('\n')
tweetsDf = pd.read_json("/content/drive/My Drive/Data/covid-tweets-en.jsonl", orient='records', lines=True, chunksize=20000)
lines=0
df = pd.DataFrame()
for chunk in tweetsDf:
    chunk.drop(["truncated", "display_text_range", "in_reply_to_status_id", "in_reply_to_status_id_str",
                "in_reply_to_user_id", "in_reply_to_user_id_str", "in_reply_to_screen_name", "is_quote_status", "quoted_status_id",
                'quoted_status_id_str', 'quoted_status_permalink', 'quoted_status', 'possibly_sensitive', 'extended_entities', 'withheld_in_countries'], axis=1, inplace=True)
    df = pd.concat([df, chunk], axis=0)
df.shape
df.head()
df.to_json("/content/drive/My Drive/Data/covid-tweets-final.jsonl", orient="records", lines=True)
"""## Lecture des tweets à partir du fichier"""
tweetsDf = pd.read_json("/content/drive/My Drive/Data/covid-tweets-final.jsonl", orient='records', lines=True, chunksize=20000)
df = pd.DataFrame()
for chunk in tweetsDf:
    df = pd.concat([df, chunk], axis=0)
tweetsDf = df.copy()
"""## Preprocessing"""
def remove_urls(text):
    result = re.sub(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''',' ',text)
    return result
stop_words = stopwords.words('english')
#add punctuation char's to stopwords list
stop_words += list(string.punctuation) # <-- contains !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
#add integers
stop_words += ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
def tokenize_lowercase(text):
    tokens = nltk.word_tokenize(text)
    stopwords_removed = [token.lower() for token in tokens if token.lower() not in stop_words]
    return stopwords_removed
def remove_nums(text_object):
    no_nums = list(filter(lambda x: x.isalpha(), text_object))
    return no_nums
lemmatizer = WordNetLemmatizer()
def lemmatize_text(df_text):
    lemmatized = []
    for w in df_text:
        if len(w) > 3:
            lemmatized.append(lemmatizer.lemmatize(w))
    return lemmatized
def list_to_sentence(list_words):
return " ".join(list_words)
def preprocess(text):
    return lemmatize_text(remove_nums(tokenize_lowercase(remove_urls(text))))
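# Illustrative check of the full pipeline (exact tokens depend on the NLTK data installed):
# preprocess("Stay safe, wash your hands! https://t.co/xyz")
# -> roughly ['stay', 'safe', 'wash', 'hand']  (URLs, stop words, punctuation,
#    non-alphabetic tokens and words of 3 characters or fewer are dropped)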
tweetsDf['full_text'] = tweetsDf['full_text'].map(remove_urls)
tweetsDf['full_text'] = tweetsDf['full_text'].apply(tokenize_lowercase)
tweetsDf['full_text'] = tweetsDf['full_text'].apply(remove_nums)
tweetsDf['full_text'] = tweetsDf['full_text'].apply(lemmatize_text)
#tweetsDf.style.set_properties(subset=['full_text'], **{'width': '1000px'})
tweetsDf['full_text'].head()
"""## LDA
### Dictionaire de données
"""
dictionary = gensim.corpora.Dictionary(tweetsDf['full_text'])
count = 0
for k, v in dictionary.items():
    print(k, v)
    count += 1
    if count > 10:
        break
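# Keep only tokens that occur in at least 15 tweets (no_below=15) and in at most 80% of
# tweets (no_above=0.8), then cap the vocabulary at the 100,000 most frequent tokens (keep_n).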
dictionary.filter_extremes(no_above=0.8, no_below=15, keep_n=100000)
"""### Bag of words"""
bow_corpus = [dictionary.doc2bow(doc) for doc in tweetsDf['full_text']]
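# doc2bow maps each tweet to a sparse list of (token_id, count) pairs,
# e.g. [(0, 1), (14, 2)] means token 0 occurs once and token 14 twice in that tweet.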
"""### TF-IDF"""
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
from pprint import pprint
# Preview the TF-IDF weights of one document
for doc in corpus_tfidf[1:]:
    pprint(doc)
    break
"""### Entrainnement du modèle"""
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} Words: {}'.format(idx, topic))
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=5, id2word=dictionary, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Words: {}'.format(idx, topic))
"""### Prediction du topic d'un nouveau tweet"""
unseen_document = 'I’d suggest that neighboring States control their frontiers & ask all Texans who want to cross State lines, have a Covid test! Otherwise all the efforts of good people in neighboring states, wearing masks & being careful, can be thrown in the air by these irresponsible governors!'
bow_vector = dictionary.doc2bow(preprocess(unseen_document))
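# lda_model_tfidf[bow_vector] (and lda_model[bow_vector]) return (topic_id, probability)
# pairs for the unseen tweet; the loops below sort them by descending probability and
# show the 5 top words of each matching topic.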
for index, score in sorted(lda_model_tfidf[bow_vector], key=lambda tup: -1*tup[1]):
    print("{} - Score: {}\t Topic: {}".format(index, score, lda_model_tfidf.print_topic(index, 5)))
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))
"""### Visualisation des topics"""
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model_tfidf, bow_corpus, dictionary=lda_model_tfidf.id2word)
vis
# Topic distribution over the whole corpus (one list of (topic_id, probability) pairs per tweet).
# loaded_model (the pickled copy of lda_model_tfidf, reloaded below) gives the same result.
get_document_topics = lda_model_tfidf.get_document_topics(corpus_tfidf)
# get_document_topics = loaded_model.get_document_topics(corpus_tfidf)
"""### Sauvegarde des données et modèles"""
filename = '/content/drive/My Drive/models/lda_model_tfidf.sav'
pickle.dump(lda_model_tfidf, open(filename, 'wb'))
filename = '/content/drive/My Drive/models/lda_model.sav'
pickle.dump(lda_model, open(filename, 'wb'))
np.save("/content/drive/My Drive/models/lda_bow.npy", bow_corpus)
filename = '/content/drive/My Drive/models/lda_model_tfidf.sav'
loaded_model = pickle.load(open(filename, 'rb'))
bow_corpus = np.load('/content/drive/My Drive/models/lda_bow.npy', allow_pickle=True)
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(loaded_model, bow_corpus, dictionary=loaded_model.id2word)
pyLDAvis.save_html(vis,"lda_topic_vis.html")
def get_tweet_by_df_id(id,df):
    return df.iloc[id, :]['id']
cpt = 0
docs = {}
topic = 4
for i in get_document_topics:
    # Sort the (topic_id, probability) pairs so the dominant topic comes first,
    # then file this tweet's id under that topic.
    tmp = sorted(i, key=lambda tup: tup[1], reverse=True)
    if docs.get(tmp[0][0], None) is None:
        docs[tmp[0][0]] = []
    docs[tmp[0][0]].append(get_tweet_by_df_id(cpt, df))
    cpt += 1
filename = '/content/drive/My Drive/models/tweetsIdsByTopic.npy'
pickle.dump(docs, open(filename, 'wb'))
filename = '/content/drive/My Drive/models/tweetsIdsByTopic.npy'
doc = pickle.load(open(filename, 'rb'))
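# Illustrative use of the reloaded mapping (topic id 0 is just an example):
# print(doc.get(0, [])[:5])  # first five tweet ids grouped under topic 0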