-
Notifications
You must be signed in to change notification settings - Fork 0
/
deduplicate_books.rb
executable file
·43 lines (36 loc) · 1.2 KB
/
deduplicate_books.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
#!/usr/bin/env ruby
# 1. Removes rows whose 作品ID and 人物ID do not match the IDs embedded in their 図書カードURL
# 2. Removes rows with a duplicate 'text' field (the first occurrence is kept)
require 'json'
require 'optparse'
# Input and output paths. Each falls back to the corresponding standard
# stream when its --in / --out flag is not supplied.
args = { in: '/dev/stdin', out: '/dev/stdout' }

option_parser = OptionParser.new
option_parser.on('--in FILE') { |path| args[:in] = path }
option_parser.on('--out FILE') { |path| args[:out] = path }
# parse! reads ARGV and removes the flags it recognises.
option_parser.parse!
# Extracts the two numeric IDs from a 図書カードURL of the form
# ".../cards/<digits>/card<digits>.html". The first group is compared with
# 人物ID and the second with 作品ID below. The dot before "html" is escaped so
# that it only matches a literal ".".
CARD_URL_PATTERN = %r{cards/(?<author_id>\d+)/card(?<book_id>\d+)\.html}

# Texts that have already been written to the output. Only membership is ever
# checked, so the values are plain `true` rather than the whole meta hash.
seen_texts = {}

# Reads the input one JSON document per line and copies to the output every
# row that (1) has 作品ID / 人物ID agreeing with the IDs in its 図書カードURL and
# (2) has a 'text' value not seen on an earlier kept row. Every dropped row is
# reported on STDERR, followed by a running progress counter.
File.open(args[:out], 'w') do |f|
  # Count from 1 so the progress counter equals the number of lines read.
  File.foreach(args[:in]).with_index(1) do |line, line_no|
    # A blank line (e.g. a trailing newline) carries no row; skip it instead
    # of letting the JSON parser raise and abort the whole run.
    next if line.strip.empty?

    # JSON.parse rather than JSON.load: the latter is documented as intended
    # for trusted input only.
    row = JSON.parse(line)
    meta = row['meta']
    card_url = meta['図書カードURL']

    # `card` is nil when the URL is missing or does not have the expected
    # shape; such a row cannot agree with its card and is dropped like any
    # other mismatch instead of raising NoMethodError.
    card = CARD_URL_PATTERN.match(card_url.to_s)
    # end_with? (not ==) is kept from the original comparison of 作品ID with the
    # book ID taken from the URL. to_s guards against a missing 作品ID.
    ids_agree = card &&
                meta['作品ID'].to_s.end_with?(card[:book_id]) &&
                meta['人物ID'] == card[:author_id]

    if !ids_agree
      STDERR.puts "ignoring #{meta['作品ID']}:#{meta['作品名']} by #{meta['人物ID']}:#{meta['姓']}#{meta['名']} (#{card_url})"
    elsif seen_texts.key?(row['text'])
      STDERR.puts "ignoring #{meta['作品ID']}:#{meta['作品名']} because the 'text' field is duplicated"
    else
      seen_texts[row['text']] = true
      # `line` still carries its own newline, so puts does not add another.
      f.puts line
    end

    # "\r" returns the cursor to column 0 so the counter is overwritten in place.
    STDERR.write "Progress: #{line_no}\r"
  end
end