-
Notifications
You must be signed in to change notification settings - Fork 0
/
epicosm.py
155 lines (121 loc) · 5.79 KB
/
epicosm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
# -*- coding: utf-8 -*-
import os
import sys
import glob
import argparse
import time
import datetime
import subprocess
import signal
import schedule
# from ./modules
from modules import mongo_ops, epicosm_meta, twitter_ops, env_config, mongodb_config
def args_setup(argv=None):
    """Build the Epicosm command-line parser and parse the arguments.

    Args:
        argv: Optional list of argument strings to parse instead of the
            process command line. When left as ``None`` (the default),
            argparse reads ``sys.argv[1:]``, which is the behaviour
            existing callers get.

    Returns:
        A ``(parser, args)`` tuple: the ``argparse.ArgumentParser`` itself
        (so callers can print its help text) and the parsed namespace.
        Every option is a boolean flag that defaults to ``False``.
    """
    parser = argparse.ArgumentParser(
        description="Epidemiology of Cohort Social Media",
        epilog="Example: python3 epicosm.py --harvest --repeat",
    )

    # (flag, help text) pairs; every option is a simple on/off switch.
    flag_specs = (
        ("--harvest",
         "Harvest tweets from all users from a file called user_list (provided by you) with a single user per line."),
        ("--get_friends",
         "Create a database of the users that are being followed by the accounts in your user_list. (This process can be very slow, especially if your users are prolific followers.)"),
        ("--repeat",
         "Repeat the harvest every 72 hours. This process will need to be put to the background to free your terminal prompt."),
        ("--refresh",
         "If you have a new user_list, this will tell Epicosm to switch to this list."),
        ("--start_db",
         "Start the MongoDB daemon in this folder, but don't run any Epicosm processes."),
        ("--stop",
         "Stop all Epicosm processes."),
        ("--shutdown_db",
         "Stop all Epicosm processes and shut down MongoDB."),
    )
    for flag, help_text in flag_specs:
        parser.add_argument(flag, action="store_true", help=help_text)

    # parse_args(None) falls back to sys.argv[1:], preserving the old behaviour.
    args = parser.parse_args(argv)
    return parser, args
def main():
    """Run one complete Epicosm cycle, driven by the parsed command-line flags.

    Relies on the module-level names ``parser`` and ``args``, which the
    ``__main__`` section assigns from ``args_setup()`` before calling this
    function. It takes no parameters, so it can be handed directly to the
    scheduler for repeated runs.

    Sequence: print help and exit when no arguments were given; handle
    ``--stop`` / ``--shutdown_db``; check the environment and start MongoDB
    (exiting afterwards for ``--start_db``); obtain Twitter credentials; set
    up logging, the SIGINT handler and the status file; index the database;
    look up user ids when needed; optionally harvest tweets and friend
    lists; back up the database and prune old backups; finally mark the
    status file as down.

    Raises:
        SystemExit: after printing help, after ``--stop`` /
            ``--shutdown_db``, and after ``--start_db``.
    """
    # Set paths as instance of EnvironmentConfig
    env = env_config.EnvironmentConfig()

    # print help message if no args provided
    if len(sys.argv) < 2:
        parser.print_help()
        sys.exit(0)

    if args.stop or args.shutdown_db:
        if args.shutdown_db:
            mongo_ops.stop_mongo(env.db_path)
        print("OK, stopping Epicosm processes.")
        # Sends SIGTERM to every process whose command line matches "epicosm".
        subprocess.call(["pkill", "-15", "-f", "epicosm"])
        sys.exit(0)

    # check running method
    epicosm_meta.native_or_compiled()

    # check environment
    (mongod_executable_path, mongoexport_executable_path,
     mongodump_executable_path, screen_names) = epicosm_meta.check_env()

    # start mongodb
    mongo_ops.start_mongo(mongod_executable_path,
                          env.db_path,
                          env.db_log_filename,
                          env.epicosm_log_filename)
    if args.start_db:
        print("OK, MongoDB started, but without Epicosm processes.")
        sys.exit(0)

    # verify credentials
    credentials, auth, api = twitter_ops.get_credentials()

    # set up logging
    epicosm_meta.logger_setup(env.epicosm_log_filename)

    # setup signal handler
    signal.signal(signal.SIGINT, epicosm_meta.signal_handler)

    # modify status file
    epicosm_meta.status_up(env.status_file)

    # tidy up the database for better efficiency
    mongo_ops.index_mongo(env.run_folder)

    # get persistent user ids from screen names
    if args.refresh or not os.path.exists(env.run_folder + "/user_list.ids"):
        twitter_ops.lookup_users(env.run_folder, screen_names, credentials, auth, api, args)

    # get tweets for each user and archive in mongodb
    if args.harvest:
        try:
            twitter_ops.harvest(env.run_folder, credentials, auth, api,
                                mongodb_config.client, mongodb_config.db, mongodb_config.collection)
        except Exception:  # catching db down issues
            # Deliberately not a bare ``except``: SystemExit and
            # KeyboardInterrupt must propagate so that a stop request is
            # not mistaken for a database failure and answered with a
            # restart plus a second harvest.
            print("Is the DB down? Trying to restart...")
            mongo_ops.stop_mongo(env.db_path)
            mongo_ops.start_mongo(mongod_executable_path,
                                  env.db_path,
                                  env.db_log_filename,
                                  env.epicosm_log_filename)
            # Single retry; a second failure is allowed to propagate.
            twitter_ops.harvest(env.run_folder, credentials, auth, api,
                                mongodb_config.client, mongodb_config.db, mongodb_config.collection)

    # if user wants the friend list, make it
    if args.get_friends:
        twitter_ops.get_friends(env.run_folder, credentials, auth,
                                api, mongodb_config.friends_collection)
        # We only want to do this once. The flag was parsed a single time at
        # start-up, so it has to be cleared on the parsed namespace for later
        # scheduled runs to skip this branch. The raw token is also dropped
        # from sys.argv, but only if present: it may be absent when the
        # option was given as an abbreviation.
        args.get_friends = False
        if "--get_friends" in sys.argv:
            sys.argv.remove("--get_friends")
        # create CSV file of users' friends list.
        mongo_ops.export_csv_friends(mongoexport_executable_path,
                                     env.csv_friends_filename,
                                     env.epicosm_log_filename)

    # backup database into BSON
    mongo_ops.backup_db(mongodump_executable_path,
                        env.database_dump_path,
                        env.epicosm_log_filename,
                        env.processtime)

    # rotate backups - if there are more than 3, remove the oldest one
    backup_dir = env.database_dump_path + "/twitter_db"
    current_backup_count = sum(
        1 for name in os.listdir(backup_dir)
        if os.path.isfile(os.path.join(backup_dir, name))
    )
    # each backup is one bson and one json of metadata, so 6 = 3 backups.
    if current_backup_count > 6:
        print("Rotating backups.")
        bu_list = sorted(glob.glob(backup_dir + "/tweets*"))
        # remove the oldest two, a bson and a json. Slicing (rather than
        # indexing [0] and [1]) avoids an IndexError when fewer than two
        # files match the "tweets*" pattern even though the directory holds
        # more than six files overall.
        for stale_backup in bu_list[:2]:
            subprocess.call(["rm", stale_backup])

    # modify status file
    epicosm_meta.status_down(env.status_file, env.run_folder)
    print(f"Scheduled task finished at {datetime.datetime.now().strftime('%Y-%m-%d_%H:%M:%S')}.\n")
if __name__ == "__main__":
    # Parse the command line once; main() reads these module-level names.
    parser, args = args_setup()

    # Every invocation performs one immediate run.
    main()

    # With --repeat, keep the process alive and re-run main() on a
    # three-day cadence at 06:00, polling the scheduler every 15 seconds.
    if args.repeat:
        schedule.every(3).days.at("06:00").do(main)
        while True:
            schedule.run_pending()
            time.sleep(15)