-
Notifications
You must be signed in to change notification settings - Fork 1
/
text_compare.py
228 lines (203 loc) · 10.5 KB
/
text_compare.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
#!/usr/bin/env python
"""
Compare 2 files
Generate a report with following stats:
Generate differences
The script takes a config file with following parameters as an argument
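fileA - (Mandatory) First file to be taken for comparison
fileB - (Mandatory) Second file to be taken for comparison
reportfile - (Mandatory) Name of the output report file
fileADel - (Needed for delimited files) The delimiter for fileA
fileBDel - (Needed for delimited files) The delimiter for fileB
colwidths - (Needed for fixed-width files) Comma-separated column widths
metafile - (Optional) File listing the field names, one name per line
keyfields - (Mandatory) Comma-separated, 1-based positions of the fields that form the record key
ignorefields - (Optional) Comma-separated, 1-based positions of the fields meant to be left out of the comparison
skipRecs - (Mandatory for delimited files) Number of leading records to skip in each file
fileAOnly - (Optional) File that receives the records found only in fileA
fileBOnly - (Optional) File that receives the records found only in fileB
keyMismatchThreshold - (Optional) Number of mismatch samples to report per field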
"""
import sys
import csv
from datetime import datetime as dt
from collections import defaultdict, OrderedDict
def show_progress(iteration, total, prefix='', suffix='', decimals=1, barlength=50, clrlen=100):
    """
    Call in a loop to create terminal progress bar
    @params:
        iteration - Required : Current iteration (Int)
        total     - Required : total iterations (Int)
        prefix    - Optional : prefix string (Str)
        suffix    - Optional : Suffix string (Str)
        decimals  - Optional : Positive number of decimals in percent complete (Int)
        barlength - Optional : Character length of the bar (Int)
        clrlen    - Optional : Number of blanks written first to wipe the previous bar (Int)
    @returns:
        Length of the line just written (leading carriage return included),
        which callers can pass back as clrlen on the next call.
    """
    # Blank out whatever the previous call left on the current terminal line.
    sys.stdout.write('\r%s' % (' ' * clrlen))
    sys.stdout.flush()

    if total:
        fraction = iteration / float(total)
        filled_len = int(round(barlength * iteration / float(total)))
    else:
        # Nothing to process: show a complete bar instead of dividing by zero.
        fraction = 1.0
        filled_len = barlength

    percents = "{0:.{1}f}".format(100 * fraction, decimals)
    bar = chr(130) * filled_len + '-' * (barlength - filled_len)
    rendered = '\r%s |%s| %s%s %s' % (prefix, bar, percents, '%', suffix)

    sys.stdout.write(rendered)
    if iteration == total:
        # Final step: move to a fresh line so later output does not overwrite the bar.
        sys.stdout.write('\n')
    sys.stdout.flush()
    return len(rendered)
def delimit(line, lengths):
    """Yield consecutive slices of 'line', one per width listed in 'lengths'.

    Slicing starts at the beginning of 'line'; each slice begins where the
    previous one ended. Widths that run past the end of 'line' produce
    shortened or empty slices.
    """
    start = 0
    for width in lengths:
        end = start + width
        yield line[start:end]
        start = end
def timestamp(message):
    """Print 'message' on stdout, prefixed with the current local date and time."""
    stamp = dt.now().strftime("%x %X")
    tail = ': %s \n' % message
    sys.stdout.write(stamp + tail)
def parse_config(configfile):
    """Parse the config file 'configfile' and return the parsed key-value pairs as dict.

    Each entry has the form key=value. Blank lines and lines whose first
    non-blank character is '#' are skipped. Only the first '=' separates the
    key from the value, so values may themselves contain '='.

    Raises ValueError when a remaining line has no '=' at all.
    """
    config = {}
    with open(configfile) as handle:
        for lineno, raw in enumerate(handle, start=1):
            # The whole line is stripped, not the key and value separately,
            # so blanks directly around '=' stay part of the key/value.
            entry = raw.strip()
            if not entry or entry.startswith('#'):
                continue
            key, sep, value = entry.partition('=')
            if not sep:
                raise ValueError('%s line %d is not in key=value form: %r'
                                 % (configfile, lineno, entry))
            config[key] = value
    return config
def get_key(data, keyfields):
    """Build the record key for 'data'.

    'keyfields' is a comma-separated string of 1-based positions; the
    elements of 'data' found at those positions are returned as a tuple,
    in the order the positions are listed.
    """
    picked = []
    for token in keyfields.split(','):
        picked.append(data[int(token) - 1])
    return tuple(picked)
def write_exclusive_recs(config, filename, file_dict, excl_keys, fileDel):
    '''
    Dump the records that exist on only one side of the compare.

    For every key in 'excl_keys' the matching record from 'file_dict' is
    joined with 'fileDel' and written as one line of 'filename'. Nothing is
    written, and no file is created, when 'excl_keys' is empty.
    'config' is accepted but not used.
    '''
    if not excl_keys:
        return
    with open(filename, 'w') as sink:
        rendered = []
        for key in excl_keys:
            rendered.append(fileDel.join(file_dict[key]) + '\n')
        sink.writelines(rendered)
def get_diff(seqA, seqB, keyfields=None, ignorefields=None):
    """Compare two records field by field.

    Parameters:
        seqA, seqB   - the two records (sequences of field values)
        keyfields    - comma-separated 1-based positions of the key fields;
                       when given, both records must carry the same key
        ignorefields - 1-based positions of fields to leave out of the
                       comparison, either as a comma-separated string or as
                       an iterable of positions

    Returns a list of tuples in the form
    (index of the field in the record, value on sideA, value on sideB, key for the record).
    The field index is 0-based; the key is an empty tuple when no keyfields
    are given.

    Raises:
        KeyError   - the two records have different keys
        ValueError - the two records have a different number of fields
    """
    if keyfields:
        record_key = get_key(seqA, keyfields)
        if record_key != get_key(seqB, keyfields):
            raise KeyError('Key mismatched')
    else:
        record_key = ()

    # zip() below would silently drop the surplus fields of the longer record.
    if len(seqA) != len(seqB):
        raise ValueError('LengthMismatch')

    if ignorefields:
        if hasattr(ignorefields, 'split'):
            tokens = ignorefields.split(',')
        else:
            tokens = ignorefields
        # Compare whole tokens: a substring test on the raw string would make
        # "13" also hide fields 1 and 3.
        ignored = frozenset(str(token).strip() for token in tokens)
    else:
        ignored = frozenset()

    return [(index, a, b, record_key)
            for index, (a, b) in enumerate(zip(seqA, seqB))
            if a != b and str(index + 1) not in ignored]
def _index_records(rows, keyfields, skip):
    """Map record key -> record for every row after the first 'skip' rows.

    When a key occurs more than once, the last record with that key is kept.
    """
    return OrderedDict(
        (get_key(row, keyfields), row)
        for position, row in enumerate(rows)
        if position >= skip
    )


def main(configfile):
    '''
    Compare the two files described by 'configfile' and write a CSV report.

    Does the following:
    1. Parse the configfile to setup the compare parameters
    2. Based on whether the files are delimited (fileADel/fileBDel) or fixed
       length (colwidths), read the files as dictionaries with key as the
       record key and value as the record; the first skipRecs rows are skipped
    3. This gives us 2 dictionaries dictA, dictB
    4. Find the keys common between 2 dicts
    5. Find the records that are exclusive to one file and write those to
       fileAOnly / fileBOnly if configured
    6. Diff every common record, counting mismatches per field and collecting
       samples (capped per field by keyMismatchThreshold when configured)
    7. Write the summary, the per-field counts and the samples to reportfile

    Exits with status 1 when neither delimiters nor colwidths are configured.
    '''
    timestamp('Process Start')
    config = parse_config(configfile)
    diff_count = defaultdict(int)  # field index -> number of records differing in that field
    rec_matched = 0
    diff_samples = []

    # Optional field names, one per line; missing names are filled in below.
    metafile = config.get('metafile')
    if metafile:
        with open(metafile) as meta:
            file_fields = [line.strip() for line in meta]
    else:
        file_fields = []

    delimited = 'fileADel' in config and 'fileBDel' in config
    if not delimited and 'colwidths' not in config:
        print('Missing Delimiter or column with information in config file. Aborting!')
        sys.exit(1)

    keyfields = config['keyfields']
    skip = int(config.get('skipRecs') or 0)

    # NOTE(review): binary modes with csv are the Python 2 idiom this script
    # was written for; Python 3 needs text mode with newline='' instead.
    with open(config['fileA'], 'rb') as fileA, open(config['fileB'], 'rb') as fileB:
        if delimited:
            rows_a = csv.reader(fileA, delimiter=config['fileADel'])
            rows_b = csv.reader(fileB, delimiter=config['fileBDel'])
        else:
            lengths = [int(x) for x in config['colwidths'].split(',')]
            rows_a = (tuple(delimit(line, lengths)) for line in fileA)
            rows_b = (tuple(delimit(line, lengths)) for line in fileB)
        t = show_progress(1, 6, prefix='Initial Setup', suffix='Scanning FileA')
        dictA = _index_records(rows_a, keyfields, skip)
        t = show_progress(2, 6, prefix='Initial Setup', suffix='Scanning FileB', clrlen=t)
        dictB = _index_records(rows_b, keyfields, skip)
        t = show_progress(3, 6, prefix='Initial Setup', suffix='Finding Common keys', clrlen=t)

    # Lists in file order rather than sets, so every run walks and reports
    # the records in the same sequence.
    common_keys = [key for key in dictA if key in dictB]
    t = show_progress(4, 6, prefix='Initial Setup', suffix='Finding Aonly recs', clrlen=t)
    aonly_keys = [key for key in dictA if key not in dictB]
    if 'fileAOnly' in config:
        write_exclusive_recs(config, config['fileAOnly'], dictA, aonly_keys, config.get('fileADel', ''))
    t = show_progress(5, 6, prefix='Initial Setup', suffix='Finding Bonly recs', clrlen=t)
    bonly_keys = [key for key in dictB if key not in dictA]
    if 'fileBOnly' in config:
        write_exclusive_recs(config, config['fileBOnly'], dictB, bonly_keys, config.get('fileBDel', ''))
    show_progress(6, 6, prefix='Initial Setup', suffix='Initial setup complete', clrlen=t)
    timestamp("End Initial Setup & File Read")

    total = len(common_keys)
    step = max(1, total // 100)  # refresh the bar roughly once per percent
    ignorefields = config.get('ignorefields')
    raw_threshold = config.get('keyMismatchThreshold')
    threshold = int(raw_threshold) if raw_threshold else None
    num_fields = 0  # widest record compared so far; sizes the default field names

    for i, k in enumerate(common_keys, start=1):
        if i % step == 0:
            show_progress(i, total, prefix='Comparision Progress', suffix=str(i))
        rec_a = dictA[k]
        try:
            diffs = get_diff(rec_a, dictB[k], keyfields, ignorefields)
        except ValueError:
            print("LengthMismatch in line number %s" % i)
            continue
        except KeyError:
            print("keyMismatch in line number %s" % i)
            continue
        num_fields = max(num_fields, len(rec_a))
        if not diffs:
            rec_matched += 1
            continue
        for diff in diffs:
            field_index = diff[0]
            diff_count[field_index] += 1
            # Keep every sample, or only the first 'threshold' per field.
            if threshold is None or diff_count[field_index] <= threshold:
                diff_samples.append((i, diff))

    if total:
        show_progress(total, total, prefix='Comparision Progress', suffix="Done")
    timestamp('End Comparision')

    # Default names for every field the metafile does not name.
    file_fields.extend('Column ' + str(n + 1) for n in range(len(file_fields), num_fields))

    mismatched = total - rec_matched
    with open(config['reportfile'], 'wb') as rptfile:
        rptwriter = csv.writer(rptfile, dialect='excel')
        rptwriter.writerow(['fileA', config['fileA'], len(dictA)])
        rptwriter.writerow(['fileB', config['fileB'], len(dictB)])
        rptwriter.writerow(['Number of recs exclusive to FileA', len(aonly_keys)])
        rptwriter.writerow(['Number of recs exclusive to FileB', len(bonly_keys)])
        if 'fileAOnly' in config and aonly_keys:
            rptwriter.writerow(['FileA Only recs written to: ', config['fileAOnly']])
        if 'fileBOnly' in config and bonly_keys:
            rptwriter.writerow(['FileB Only recs written to: ', config['fileBOnly']])
        rptwriter.writerow(["Rows compared", total])
        rptwriter.writerow(["Rows matched", rec_matched])
        rptwriter.writerow(["Rows mismatched", mismatched])
        rptwriter.writerow([])

        if diff_count:
            rptwriter.writerow(["Data Element mismatched stats:"])
            rptwriter.writerow(["Field Name", 'Diff Count'])
            for field_index in sorted(diff_count):
                rptwriter.writerow([file_fields[field_index], diff_count[field_index]])
            rptwriter.writerow([])

        sample_total = len(diff_samples)
        if sample_total:
            rptwriter.writerow(["Sample differences:"])
            rptwriter.writerow(["Line#"] + list(get_key(file_fields, keyfields))
                               + ["Field Name", "FileA Value", "FileB Value"])
            sample_step = max(1, sample_total // 100)
            for n, (line_no, (field_index, value_a, value_b, rec_key)) in enumerate(diff_samples, start=1):
                if n % sample_step == 0:
                    show_progress(n, sample_total, prefix='Creating Report', suffix=str(n))
                rptwriter.writerow([line_no] + list(rec_key)
                                   + [file_fields[field_index], value_a, value_b])
            show_progress(sample_total, sample_total, prefix='Creating Difference Report', suffix="Done")
    timestamp('End Report Generation')

    # Decide on actual mismatches, not on how many samples were kept.
    if mismatched + len(aonly_keys) + len(bonly_keys) == 0:
        print("\nNo Differences found!")
    else:
        print("\nDifferences found refer to %s for details" % config['reportfile'])
    print("\nComparision Complete!")
if __name__ == "__main__":
    # argv[1] is the full name of the config file
    if len(sys.argv) < 2:
        # Fail with a usage hint instead of an IndexError traceback.
        sys.stderr.write('Usage: %s <configfile>\n' % sys.argv[0])
        sys.exit(2)
    main(sys.argv[1])