-
Notifications
You must be signed in to change notification settings - Fork 150
/
unshuffle.py
92 lines (63 loc) · 1.73 KB
/
unshuffle.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
"""
Unshuffle previously shuffled file
unshuffle.py input_file.csv output_file.csv <max. lines in memory> <random seed>
"""
import sys
import random
USAGE = (
    "usage: unshuffle.py input_file.csv output_file.csv "
    "<max. lines in memory> <random seed>"
)
DEFAULT_LINES_IN_MEMORY = 100000
PROGRESS_EVERY = 100000  # print a progress counter every this many lines read


def count_lines(f):
    """Return the number of lines in the open file *f*.

    Prints the running total every PROGRESS_EVERY lines. The file position is
    left at the end of the file; callers seek back before re-reading.
    """
    count = 0
    for _ in f:
        count += 1
        if count % PROGRESS_EVERY == 0:
            print(count)
    return count


def unshuffle_order(count):
    """Return the read order that undoes a shuffle of *count* lines.

    The shuffle is re-created with the module-level ``random`` generator, so
    the caller must seed it first. ``order[i]`` is the original index of the
    line now at position ``i``; sorting the positions by that original index
    yields the positions to read, in original-line order.
    """
    order = list(range(count))  # list() keeps this working on Python 3 too
    random.shuffle(order)
    return sorted(range(count), key=order.__getitem__)


def unshuffle(input_file, output_file, lines_in_memory, random_seed):
    """Write the lines of *input_file* to *output_file* in unshuffled order.

    The input is scanned once to count lines, then once per chunk of at most
    *lines_in_memory* lines: each pass collects the lines belonging to the
    current chunk and writes them out in their original order.

    Args:
        input_file: path of the shuffled file.
        output_file: path of the file to create (overwritten if it exists).
        lines_in_memory: maximum number of lines cached per pass; must be >= 1.
        random_seed: seed passed to ``random.seed`` to re-create the shuffle.

    Raises:
        ValueError: if *lines_in_memory* is smaller than 1 (a zero chunk size
            would never make progress).
    """
    if lines_in_memory < 1:
        raise ValueError("lines_in_memory must be at least 1")

    random.seed(random_seed)

    print("counting lines...")
    # Both files are opened in binary mode so lines are copied byte-for-byte,
    # and the context managers guarantee they are flushed and closed.
    with open(input_file, 'rb') as f:
        count = count_lines(f)
        print(count)

        print("(un)shuffling...")
        with open(output_file, 'wb') as o_f:
            order = unshuffle_order(count)

            lines_saved = 0
            chunk_starts = range(0, count, lines_in_memory)
            for epoch, start in enumerate(chunk_starts, 1):
                # Slice by offset instead of repeatedly copying the remainder.
                current_chunk = order[start:start + lines_in_memory]
                wanted = set(current_chunk)  # O(1) membership tests
                current_lines = {}

                f.seek(0)
                for position, line in enumerate(f):
                    if position in wanted:
                        current_lines[position] = line
                        if len(current_lines) == len(current_chunk):
                            # Every line of this chunk is cached; stop early.
                            break
                    if (position + 1) % PROGRESS_EVERY == 0:
                        print(position + 1)

                print("writing...")
                o_f.writelines(current_lines[i] for i in current_chunk)

                lines_saved += len(current_chunk)
                print("pass %s complete (%s lines saved)" % (epoch, lines_saved))


def main(argv):
    """Parse command-line arguments from *argv* and run the unshuffle.

    Exits with status 1 when the input/output paths or the seed are missing,
    or when the memory limit is not positive. A non-integer memory limit still
    raises ValueError from ``int``.
    """
    if len(argv) < 3:
        print(USAGE)
        sys.exit(1)
    input_file = argv[1]
    output_file = argv[2]

    try:
        lines_in_memory = int(argv[3])
    except IndexError:
        lines_in_memory = DEFAULT_LINES_IN_MEMORY
    if lines_in_memory < 1:
        print("max. lines in memory must be at least 1")
        sys.exit(1)
    print("caching %s lines at a time..." % lines_in_memory)

    try:
        random_seed = argv[4]
    except IndexError:
        print("need a seed...")
        sys.exit(1)
    # The seed is deliberately kept as the raw string from the command line.
    print("random seed: %s" % random_seed)

    unshuffle(input_file, output_file, lines_in_memory, random_seed)


if __name__ == "__main__":
    main(sys.argv)