-
Notifications
You must be signed in to change notification settings - Fork 0
/
TrainDocuments.pl
85 lines (73 loc) · 3.18 KB
/
TrainDocuments.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#!/usr/bin/perl
# Written by M. Paramita ([email protected])
# Last update 16 August 2011
#
# TrainDocuments.pl -- command-line driver that chains three helper scripts,
# all looked up in the current working directory:
#
#   1. SplitPairs.pl  <input> <validation> <training>
#        chooses a subset of the input for validation and keeps the rest
#        as training data.
#   2. Process.pl     <in> <out> <mapping...>
#        run once for the training file and once for the validation file.
#   3. Ecoc_train.pl  <source> <target> <classes> <training> <validation>
#                     <modelFolder> <mapping...>
#
# Required arguments (each flag followed by exactly one value):
#   --source, --target, --input, --model, --param "mapping=<labels>"
#
# Exit status is 0 on success and non-zero when the arguments are invalid or
# when any helper script fails.
use strict;
use warnings;
use List::Util 'max';

# Flush every echoed command line before the child process starts writing,
# so the log keeps its order when STDOUT is redirected to a file or pipe.
$| = 1;

# Flags that must all be supplied.
my @required_flags = qw(--source --target --input --model --param);

# Print the usage text on STDOUT and stop with a failure status.
sub usage_and_exit {
    my $rule = "-------------------------------------------------------------------------------------------------------\n";
    print "Missing parameter. Please run the tool based on the guideline below.\n\n";
    print $rule;
    print "To run this script, please use the following arguments:\n\n";
    print " \"perl TrainDocuments.pl --source [sourceLang] --target [targetLang] --input [featuresFile] \n";
    print " --model [outputModelFolder] --param \"mapping=[space-separated class mapping]\"\n\n";
    print "An example of use is:\n\n";
    print " \"perl TrainDocuments.pl --source HR --target EN --input C:\\ACCURAT\\HR-EN-summary.txt\n";
    print " --model C:\\ACCURAT\\model --param \"mapping=1 0 0 0 2 3 4\"\"\n";
    print $rule;
    exit 1;
}

# Return $path with $suffix inserted in front of a trailing ".txt".
# When $path does not end in ".txt", "$suffix.txt" is appended instead, so
# the derived name can never be identical to $path itself.
sub add_suffix {
    my ($path, $suffix) = @_;
    my $derived = $path;
    unless ($derived =~ s{\.txt\z}{${suffix}.txt}) {
        $derived .= "${suffix}.txt";
    }
    return $derived;
}

# Echo a command, run it without going through a shell (so arguments that
# contain spaces or shell metacharacters are passed through untouched), and
# abort the pipeline if it could not be started or reported a failure.
sub run_step {
    my @command = @_;
    print "@command\n";
    my $status = system(@command);
    return if $status == 0;
    die "Could not start '@command': $!\n" if $status == -1;
    die sprintf("'%s' failed (exit code %d, signal %d)\n",
        "@command", $status >> 8, $status & 127);
}

# Every required flag has to appear as an argument of its own; a substring
# match (e.g. "--inputs", or a value that merely contains "--input") does
# not count.
my %seen = map { $_ => 1 } @ARGV;
usage_and_exit() if grep { !$seen{$_} } @required_flags;

# Arguments come as flag/value pairs.
my %value_of;
for (my $i = 0; $i < @ARGV; $i += 2) {
    my $flag = $ARGV[$i];
    unless (grep { $_ eq $flag } @required_flags) {
        print "Format $flag is not recognized. Please correct the format and restart the tool.\n";
        exit 1;
    }
    # A trailing flag without a value is a missing parameter.
    usage_and_exit() unless defined $ARGV[$i + 1];
    $value_of{$flag} = $ARGV[$i + 1];
}
# Guards against a required flag that only showed up in a value position.
usage_and_exit() if grep { !defined $value_of{$_} } @required_flags;

my ($source_lang, $target_lang, $input_file, $model_dir)
    = @value_of{qw(--source --target --input --model)};

# The class mapping is the text after "mapping=" (up to a further "=", if
# any), split on runs of whitespace so repeated or leading blanks do not
# produce empty labels.
my @mapping;
if ($value_of{'--param'} =~ /mapping\s*=([^=]*)/) {
    @mapping = split ' ', $1;
}
# Without a mapping the class count below would be undefined and the helper
# command lines would be malformed.
usage_and_exit() unless @mapping;

unless (-d $model_dir) {
    mkdir $model_dir
        or die "Cannot create model folder '$model_dir': $!\n";
}

# Concatenated labels tag the processed files, e.g. "_mapped_F1000234".
my $mapping_tag = join '', @mapping;

# Choose a subset for validation (<input>_valid.txt) and use the rest as
# training data (<input>_train.txt).
my $validation_file = add_suffix($input_file, '_valid');
my $training_file   = add_suffix($input_file, '_train');
run_step('perl', 'SplitPairs.pl', $input_file, $validation_file, $training_file);

my $processed_validation_file = add_suffix($validation_file, "_mapped_F$mapping_tag");
my $processed_training_file   = add_suffix($training_file,   "_mapped_F$mapping_tag");
run_step('perl', 'Process.pl', $training_file,   $processed_training_file,   @mapping);
run_step('perl', 'Process.pl', $validation_file, $processed_validation_file, @mapping);

# The largest label in the mapping is handed to the trainer as the number
# of classes.
my $number_of_classes = max(@mapping);
print $number_of_classes . "\n";
run_step('perl', 'Ecoc_train.pl', $source_lang, $target_lang, $number_of_classes,
    $processed_training_file, $processed_validation_file, $model_dir, @mapping);