-
Notifications
You must be signed in to change notification settings - Fork 2
/
directoryToTypeTokenCounts.pl
78 lines (60 loc) · 2.63 KB
/
directoryToTypeTokenCounts.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#!/usr/bin/perl
# convert directory full of data to token count, type,
# and an arbitrary literal
# list contents of a directory
# for each file in the directory...
# for each line in the file...
# for each word in the line...
# normalize case, normalize punctuation, count
# output words ordered by token count and then alphabetically
# authors: Kevin Cohen and Foster Goss 303-916-2417
# catch typos
use strict "vars";
my %tokenCounts;
# this gets appended to every line--use it to label which corpus
# your data came from
#my $literal_string = "LITERAL";
my $literal_string = "CRAFT";
#my $literal_string = "MIMIC2MD";
# list contents of directory
#my $directory_name = "/Users/kev/Dropbox/Negation/Code";
#my $directory_name = "/Users/kev/Dropbox/A-M/Corpora/MIMIC2/negationdist/";
my $directory_name = "/Users/kev/Dropbox/A-M/Corpora/craft-1.0/articles/txt/";
#my $directory_name = "/Users/kev/Dropbox/A-M/Corpora/MIMIC2/md/";
opendir(DIR, $directory_name) || die "Couldn't open directory $directory_name: $!\n";
my @directory_contents = readdir(DIR);
#my $file_name = "/Users/kev/Dropbox/Negation/Code/md.text.txt";
#my $file_name = "/Users/kev/Dropbox/A-M/Corpora/MIMIC2/md/kev_clinical_md.txt";
while (my $file_name = pop(@directory_contents)) {
# filter file names--check to make sure that this/these work for you
unless ($file_name =~ /txt$/) { next; }
$file_name = $directory_name . $file_name;
open(IN, $file_name) || die "Couldn't open input file $file_name\n";
while (my $line = <IN>) {
#print $line;
chomp($line);
my @words = split(" ", $line);
for (my $i; $i < @words; $i = $i + 1) {
#print "$words[$i]\n";
my $normalized_word = $words[$i];
$normalized_word = lc($normalized_word);
#print "$normalized_word\n";
while ($normalized_word =~ /[\W]/) {
$normalized_word =~ s/[\W]+//;
}
#print "$normalized_word\n";
# bug fix: this was outputing an empty string in the case
# where the input was all punctuation
if ($normalized_word =~ /./) {
$tokenCounts{$normalized_word}++;
}
} # close for-loop through line
} # close while-loop through file
} # close while-loop through directory contents
#my @normalized_words = sort {($tokenCounts{$b} cmp $tokenCounts{$a}) || ($a cmp $b)} keys(%tokenCounts);
my @normalized_words = sort {($tokenCounts{$b} <=> $tokenCounts{$a}) || ($a cmp $b)} keys(%tokenCounts);
for (my $i = 0; $i < @normalized_words; $i++) {
#print "$normalized_words[$i]\n";
#print "$tokenCounts{$normalized_words[$i]}\n";
print "$tokenCounts{$normalized_words[$i]},$normalized_words[$i],$literal_string\n";
}