-
-
Notifications
You must be signed in to change notification settings - Fork 2
/
scrapeECHOEPA
executable file
·117 lines (108 loc) · 5.5 KB
/
scrapeECHOEPA
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
#!/usr/bin/env python3
#Imports
import csv
import csvkit
import os
import signal
import sys
import subprocess
from datetime import datetime
import glob
import db_connect
import EXP_PGM as pgm #script for creating views based on raw ECHOEPA data
import viewsToMaterialized as mview #script which creates materialized views of view
import re
from sqlalchemy import create_engine
#Set paths
os.chdir(os.path.dirname(sys.argv[0]))
#Connect to DB
engine,whichDb= db_connect.connect()
# Truncate
def truncate(table):
print ("truncating %s"%(table))
try:
with engine.connect() as connection:
connection.execute('truncate "%s"' %(table))
except Exception as e:
print (sys.exc_info()[0])
print (e.__class__.__name__)
sys.exit("Exit")
#used to compare number of rows in download csv to sql count(*)
def updateLastModified(table,zip):
file=open("CSV/%s.csv" %table)
reader=csv.reader(file)
csv_count=sum(1 for line in reader) - 1
file.close()
with engine.connect() as connection:
result= connection.execute('select count(*) from "%s"' %table)
sql_count=list(result.fetchone())[0]
with engine.connect() as connection:
results=connection.execute("update \"Last-Modified\" set \"csv_count\"=%s,\"sql_count\"=%s where \"name\"='%s'" %(csv_count,sql_count,table ))
#if results.rowcount == 0:
# results = engine.execute('insert into "Last-Modified" ("name","modified", "zip","csv_count", "sql_count") values (%s,%s,%s,%s,%s)' %(table,modified,zip,csv_count,sql_count))
print("update \"Last-Modified\" set \"csv_count\"=%s,\"sql_count\"=%s where \"name\"='%s'" %(csv_count,sql_count,table ))
results.close()
#commented out to run maually for now
#os.system("./wgetGSheet")
#Open list of files to scrape and add
with open('files.csv', newline='') as csvfile:
reader = csv.DictReader(csvfile)
uniqueZips={}
for row in reader:
table=row['CSV FILE']
location=row['Location']
url=row['URL']
if location not in uniqueZips.keys():
uniqueZips[location] = row
if not os.path.isfile('zips/%s' %(location)):
modified=""
while(modified==""):
print("Getting %s" %(url))
modified = subprocess.check_output("./wgetEPA %s" %(url), shell=True).decode("utf-8").strip().splitlines()[0]
with engine.connect() as connection:
results = connection.execute('update "Last-Modified" set "modified"=\'%s\' where "zip" = \'%s\' and name=\'%s\'' %(modified,location,table))
if results.rowcount == 0:
with engine.connect() as connection:
results = connection.execute('insert into "Last-Modified" ("name","modified", "zip") values (\'%s\',\'%s\', \'%s\')' %(table,modified,location))
if re.match("csv$",url.lower()):
os.rename("zips/"+location, "CSV/"+location) #move file
elif ".zip" in url.lower():
os.system("./unzipEPA %s %s.csv" %(location,table)) #Unzip
try:
with engine.connect() as connection:
test = connection.execute("select 'public.\"%s\"'::regclass" %(table))
except:
print("no schema yet exists")
t = subprocess.check_output("./createTable %s" %(table), shell=True).decode("utf-8").strip()
#create the table
with engine.connect() as connection:
connection.execute(t)
#print("strip nulls")
os.system("./stripNulls %s" %(table)) #some csvs failed to import with nulls. this also cuts the CSV (head command)
truncate(table) # truncate deletes existing rows in a table
print ("truncated %s"%(table))
#process = subprocess.Popen(["./csvImport",table,whichDb], stdout=subprocess.PIPE, preexec_fn=os.setsid, stderr=subprocess.PIPE)
#out, err = process.communicate()
#os.killpg(os.getpgid(process.pid), signal.SIGTERM)
os.system("./csvImport %s %s" %(table,whichDb))
'''
if err:
os.system("./tools/csvsqlEPACreate_pg %s %s" %(table,whichDb))
'''
updateLastModified(table,location)
#Create views based off of ECHOEPA tables
print("creating views")
pgm.build(engine)
print("creating materialized views")
mview.build(engine) #materialized views
print("running sql analyze")
engine.execute("analyze");
#Clear up files
for csv in glob.glob("./CSV/*.csv"):
os.remove(csv)
for zip in glob.glob("./zips/*.zip"):
os.remove(zip)
#Switch DB
db_connect.change_db()
#Stony Brook Specific way to update remote currentDBIndex
os.system("./toggleIndex")