#!/usr/local/gnu/bin/perl
#use strict;

my $copy  = "ln";

#-------------------------------------------------------------------------------
# Synopsis:
#    shuffleM.pl [ < -v > ] [-s seed ]
#	[ -e epoch_number ] [ -m minimum_per_class ]
#	<perc> <file> <directory>
#
# Description:
#	Copy example list read from "file" to train and test files,
#	   in "directory". X docs are copied to the
#	   training list, the other docs to the test list.
#	X is a global percentage, a percentage per class
#	  or the same number in each class, depending on the arguments
#	with -e several training epochs are put in train1 train2 ...
# Example:
#	shuffleM.pl -v 80 doc-list ohra
#	shuffleM.pl -e 10 -n 80 doc-list ohra
#
# Example input:
#	0000001255.txt eb/coa xx/test
#	0000001257.txt eb/coa
#	0000001257.txt xx/test
#	0000001261.txt eb/coa
#	0000001262.txt eb/coa
#-------------------------------------------------------------------------------

# Check arguments

my $verbose = 0;
my $epochs=1;
my $minimum=0;
my $seed;
my $hasseed=0;

if ($ARGV[0] eq "-v")
{ $verbose ++;
  shift;
}
if ($ARGV[0] eq "-vv")
{ $verbose += 2;
  shift;
}
if ($ARGV[0] eq "-s")
{ shift;
  $hasseed=1;
  $seed=shift;
}
if ($ARGV[0] eq "-e")
{ shift;
  $epochs=$ARGV[0];
  if($epochs<2)
  { print STDERR "The number of epochs should be at least 2\n";
    exit 1;
  }
  shift;
}
if ($ARGV[0] eq "-m")
{ shift;
  $minimum=$ARGV[0];
  if($minimum<1)
  { print STDERR "The minimum should be at positive\n";
    exit 1;
  }
  shift;
}

if (@ARGV != 3)
{ print STDERR "Usage: shuffleM.pl [ -v ] [-s seed ]"; 
  print STDERR " [ -e epoch_number ] [ -m minimum_per_class ]\n";
  print STDERR "	<perc> <parts_file> <directory>\n";
  exit 1;
}
my $percentage = $ARGV[0];
my $file = $ARGV[1];
my $directory = $ARGV[2];
if (($percentage == 0) && ($percentage ne "0"))
  { die "error: not a number `$percentage'\n"; }
check_file_exists($file);
check_dir_exists($directory);

# Read examples, and store them in array

my $nr_examples = 0;
my $line = 0;
my @examples;
my %examples;
my %classes;

open(IN, "$file");
while (<IN>)
{ $line++;
  if ( /(.+\.txt)\s+(.+)/ )
  { my $doc=$1;
    my $classes=$2;
    if(! $examples{$doc})
      { $examples[$nr_examples++] = $doc; }
    while ( $classes =~ /./)
    { if ( $classes =~ /(\S+)\s+(.+)/ )
      { $classes{$1}++;
        $classes=$2;
        push @{$examples{$doc}}, $1;
      }
      elsif ( $classes =~ /(\S+)/ )
      { $classes{$1}++;
        $classes="";
        push @{$examples{$doc}}, $1;
      }
      else
        { die "line $line: syntax error: $classes\n"; }
  } }
  else
    { die "line $line: syntax error\n"; }
}
close(IN);

my $skipped=0;
open(CLASSES,">$directory/classes");
my @classes=sort { $b cmp $a } keys %classes;
while(@classes)
  { my $cl=pop(@classes);
    if($classes{$cl}<$minimum)
      { $skipped++; }
    print CLASSES "$cl $classes{$cl}\n"; }
close(CLASSES);
if($skipped)
  { print "$skipped classes skipped\n"; }

if($hasseed)
  { srand($seed); }
else
  { srand(time()^($$+($$<<15))); }
  
open(TEST,">$directory/test");
if($epochs==1)
  { open(TRAIN,">$directory/train"); }
else
  { for(my $x=1;$x<=$epochs;$x++)
    { if ($verbose)
      { print STDERR "opening $directory/train$x\n"; }
      open "TRAIN$x",">$directory/train$x"; 
    }
  }

# globally shuffle examples and copy them to test and train files
$skipped=0;
shuffle(0,$nr_examples);
my $nr_train = int($percentage *  $nr_examples / 100);
copies_train(0,$nr_train);
copies_test($nr_train,$nr_examples);
if($skipped)
  { print "$skipped documents skipped\n"; }

if ($verbose)
{ print STDERR "copied $nr_examples documents\n";
  print STDERR "selected $nr_train documents for training\n";
}

# Sub-routines

sub shuffle
{ my $begin=shift(@_);
  my $nr=shift(@_);
  if ($verbose)
    { print STDERR "shuffle:$begin+$nr\n"; }
  for (my $i = 0; $i < $nr; $i++)
  { my $tmp = $examples[$begin+$i];
    my $r = int(rand($i + 1)) + $begin;
    $examples[$begin+$i] = $examples[$r];
    $examples[$r] = $tmp;
  }
}

sub copies_train
{ my $begin=shift(@_);
  my $nb=shift(@_);
  my $end=$begin+$nb;
  if($epochs==1)
  { if ($verbose)
      { print STDERR "copies:$nb in train\n"; }
    for (my $i = $begin; $i < $end; $i++)
      { copy_one_doc($i,"TRAIN"); }
  }
  else
  { my $nbp=int($nb/$epochs);
    my $j=$begin;
    for(my $ne=1; $ne<=$epochs; $ne++)
    { if ($verbose)
        { print STDERR "copies:$nbp in train$ne\n"; }
      for (my $i = 0; $i < $nbp; $i++)
      { copy_one_doc($j,"TRAIN$ne");
#        print {"TRAIN$ne"} "$examples[$j] $examples{$examples[$j]}\n";
	$j++;
    } }
} }

sub copies_test
{ my $begin=shift(@_);
  my $end=shift(@_);
  if ($verbose)
    { print STDERR "copies:$begin-$end in test\n"; }
  for (my $i = $begin; $i < $end; $i++)
    { copy_one_doc($i,"TEST"); }
#    print TEST "$examples[$i] $examples{$examples[$i]}\n";
}  

sub copy_one_doc
{ my $ne= shift(@_);
  my $file=shift(@_);
  my $first=1;
  my @docclass=@{$examples{$examples[$ne]}};
  foreach my $i ( 0 .. $#{@docclass} )
  { my $cl=$docclass[$i];
    if($classes{$cl}>=$minimum)
    { if($first)
      { print {$file} $examples[$ne];
	$first=0; }
      print {$file} " $cl";
  } }
  if($first)
  { # print "document skiped: $examples[$ne] $docclass[$i]\n";
    $skipped++;
  }
  else
    { print {$file} "\n"; }
}

sub check_file_exists
{ my($file) = @_;
  if (! -f $file)
    { die "error: file `$file' does not exist\n"; }
}

sub check_dir_exists
{ my($dir) = @_;
  if (! -d $dir)
  { die "error: directory `$dir' does not exist\n"; }
}
