#!/usr/bin/perl
#                              -*- Mode: Perl -*-
# dirsplit ---
# Author           : Eduard Bloch ( blade@debian.org )
# Last Modified On : Sun, 06 Feb 2005 14:59:51 +0100
# Status           : Working, but use with caution!
# License: GPLv2

# Release number, printed by --version.
my $version="0.3.3";

require v5.8.1;
use strict;
use List::Util 'shuffle';
use Getopt::Long qw(:config no_ignore_case bundling);
use File::Basename;
use File::Path;
use Cwd 'abs_path';

# Exit status handed to exit() once all volumes have been written.
my $ret=0;

# Tunables with their built-in defaults; each of them can be replaced through
# a command line option (see %options).
my $max    = "4488M";   # size of the medium, unit suffix allowed (-s)
my $prefix = "vol_";    # first part of the catalog/directory names (-p)
my $acc    = 20;        # number of packing attempts (-a)
my $emode  = 1;         # directory exploration mode (-e)
my $bsize  = 2048;      # block size used when rounding file sizes (-b)
my $ofac   = 50;        # overhead factor applied to filename lengths (-o)

# Switches and string options; they stay undefined until GetOptions sets them.
my ($opt_help, $opt_longhelp, $get_ver);
my ($opt_sim, $opt_ver);
my ($opt_dir, $opt_flat);
my ($opt_move, $opt_sln, $opt_ln);
my ($opt_filter, $opt_follow);
my ($opt_simple, $opt_listfile);


# Getopt::Long specification: option names (with aliases and value types)
# mapped to the variables receiving the parsed values.
my %options = (
   # help and version
   "h|help"       => \$opt_help,
   "H|longhelp"   => \$opt_longhelp,
   "version"      => \$get_ver,

   # general behaviour
   "n|no-act"     => \$opt_sim,
   "v|verbose"    => \$opt_ver,
   "s|size=s"     => \$max,
   "p|prefix=s"   => \$prefix,

   # what to do with the files
   "m|move"       => \$opt_move,
   "l|symlink"    => \$opt_sln,
   "L|hardlink"   => \$opt_ln,
   "d|dirhier"    => \$opt_dir,
   "flat"         => \$opt_flat,

   # how the input is collected
   "e|expmode=i"  => \$emode,
   "f|filter=s"   => \$opt_filter,
   "F|follow"     => \$opt_follow,
   "T|input=s"    => \$opt_listfile,

   # size estimation and packing
   "o|overhead=i" => \$ofac,
   "b|blksize=i"  => \$bsize,
   "a|accuracy=i" => \$acc,
   "S|simple"     => \$opt_simple
);

# Parse the command line. A parse error prints the short help and exits with
# status 1.
show_help(1) unless ( GetOptions(%options));
# Help that was explicitly asked for is not an error: exit with status 0, the
# same status show_longhelp uses.
show_help(0) if $opt_help;
show_longhelp() if $opt_longhelp;
if($get_ver) {
   # terminate the line so the shell prompt does not end up behind the number
   print "$version\n";
   exit 0;
}

# ignore the old dirhier setting since it is default now and disable the flag when opt_flat is specified
$opt_dir = !$opt_flat;

# a simulation without output would be pointless, so -n implies -v
$opt_ver = 1 if $opt_sim;
# symlink and hardlink modes share the code path of the move mode
$opt_move=1 if ($opt_sln || $opt_ln);

# @sizes: the pool of "items" to distribute, one object size per entry.
# %names: maps an item size to the list of objects having that size; an object
#         is either a filename or an array reference (coalesced files).
my @sizes;
my %names;

# @result holds the calculated output, one entry (bin) per volume. In simple
# mode a bin contains the atoms themselves (files or file lists), otherwise it
# contains sizes which are resolved through %names later.
my @result;

# Absolute path of the directory given on the command line (directory mode only).
my $inputdir;

# Convert the medium size to a plain number and keep about 400kB free for the
# iso headers.
$max = fixnr($max) - 420000;

# Wasted space of the best solution found so far.
my $globwaste=0;



# Fill the item pool, either by walking the directory named on the command
# line or, when no directory was given, from a size/path list (-T).
if(!(-d $ARGV[0]) && !(-d readlink($ARGV[0]))) {
   # no usable directory argument, a list file is the only remaining source
   die "Error: please specify a directory\n" if(!$opt_listfile);

   my $listfh;
   if($opt_listfile eq "-") {
      $listfh=\*STDIN;
   }
   else {
      open($listfh, "<", $opt_listfile) || die "Cannot open list file $opt_listfile\n";
   }
   parseListe($listfh);
}
else {
   syswrite(STDOUT,"Building file list, please wait...\n");
   # remember the absolute path before anything else happens
   $inputdir=Cwd::abs_path($ARGV[0]);
   explore($inputdir);
}

# Refuse requests that cannot be fulfilled and add up the total size.
my $testsize=0;
foreach my $size (@sizes) {
   if($size>$max) {
      die "Too large object(s) ($size) for the given max size: @{$names{$size}} (maybe coalesced in arrays, check manually)\n";
   }
   $testsize+=$size;
}

# Everything fits onto one medium: a single pass produces the list, further
# trials cannot improve anything.
if($testsize <= $max) {
   $acc=1;
}
if($opt_ver) {
   print "\nSumm: $testsize\n";
}
# less than 4096 in total looks like just an empty directory
if($testsize<4096) {
   die "Nothing to do!\n";
}

# Distribute the items over the volumes. In simple mode insitem has already
# filled @result while the items were collected, so there is nothing to
# calculate here.
if(!$opt_simple) {
   syswrite(STDOUT, "Calculating, please wait...\n");
   my $starttime=time;
   # At least one pass is required to get a result at all; an accuracy of 0
   # (or less) would otherwise leave @result empty and write nothing.
   $acc=1 if($acc<1);
   # upper bound, any real solution wastes less than this
   $globwaste=$max*@sizes;
   for my $round (1..$acc) {
      syswrite(STDOUT,".");
      my @tmp;
      #my $waste = bp_bestfit($max, \@in, \@tmp);
      my $waste = bp_firstfit($max, \@sizes, \@tmp);
      #print "D: waste - $waste\n";
      # keep the attempt with the least wasted space
      if($waste < $globwaste) {
         $globwaste=$waste;
         @result=@tmp;
      }
      # tell the user once when the calculation takes long
      if($starttime && time > $starttime+10) {
         syswrite(STDOUT,"\nSpent already over 10s (for $round iterations)\nHint: reduce accuracy to make it faster!\n");
         undef $starttime;
      }
      # first fit depends on the input order, so try another one next time
      @sizes=shuffle(@sizes);
   }

}

print "\nCalculated, using ".(scalar @result)." volumes.\n";
print "Wasted: $globwaste Byte (estimated, check mkisofs -print-size ...)\n";

# and the real work: write one mkisofs catalog per volume or, in move mode,
# move/symlink/hardlink the files into one directory per volume.
#
# NOTE: $i (the volume counter) has to stay a file-scoped lexical with this
# name, subs defined further down in this file refer to a variable $i as well.
my $i=0;
my $inDirLen=length($inputdir);

# The prefix is the same for every volume, so take it apart only once.
my $dirPrefix=dirname($prefix);
my $prefixBase=basename($prefix);
my $dirPrefixAbs=Cwd::abs_path($dirPrefix);
# Without this check an unresolvable prefix directory would turn the targets
# into paths below the filesystem root.
die "Cannot resolve target directory $dirPrefix\n" if($opt_move && !defined($dirPrefixAbs));

for my $volume (@result) {
   $i++;
   my $o;
   if(! ($opt_move || $opt_sim)) {
      # three-argument open, and fail loudly instead of losing the catalog
      open($o, ">", "$prefix$i.list") || die "Cannot write catalog $prefix$i.list: $!\n";
   }

   for my $atom (@{$volume}) {
      # For simple mode, the files/atoms are already resolved, otherwise take
      # the next with appropriate size.
      my $item= $opt_simple ? $atom : shift(@{$names{$atom}});

      # always work on a list of files, wrap single files into one
      my $stuffRef = (ref($item) eq "ARRAY") ? $item : [$item];

      for my $file (@$stuffRef) {
         my $relFile;
         if(defined($inputdir)) {
            # strip the input directory and the slash following it
            $relFile=substr($file,$inDirLen+1);
         }
         else {
            # List mode (-T): there is no input directory to strip. Only drop
            # leading "/" and "./" so relative paths keep their first character.
            ($relFile=$file) =~ s{^(?:\.?/)+}{};
         }
         my $base=basename($relFile);
         if($opt_move) {
            my $targetsubdir = $dirPrefixAbs."/$prefixBase$i";
            $targetsubdir .= "/".dirname($relFile) if($opt_dir);
            print "$file -> $targetsubdir/$base\n" if($opt_ver);
            next if($opt_sim);

            # mkpath reports failure by dying and returns an empty list when
            # the directory exists already, so its return value is no error
            # indicator.
            eval { mkpath($targetsubdir); 1 } || die "Problems creating $targetsubdir\n";
            # last check
            die "Could not create $targetsubdir?\n" if(!(-d $targetsubdir && -w $targetsubdir));

            my $target="$targetsubdir/$base";
            my $done;
            if($opt_sln) {
               $done=symlink($file, $target);
            }
            elsif($opt_ln) {
               if(-d $file && !-l $file) {
                  # directories cannot be hardlinked, recreate them instead
                  $done=(-d $target) || mkdir($target);
               }
               else {
                  $done=link($file, $target);
               }
            }
            else {
               $done=rename($file, $target);
            }
            if(!$done) {
               # keep going with the remaining files but report the failure
               # through the exit status
               warn "Could not process $file -> $target: $!\n";
               $ret=1;
            }
         }
         else {
            # escape = in mkisofs catalogs, they are used as separator
            my $isoname = ($opt_dir?$relFile : $base);
            $isoname=~s/=/\\=/g;
            my $sourcefile=$file;
            $sourcefile=~s/=/\\=/g;
            print "$i: /$isoname=$sourcefile\n" if $opt_ver;
            print {$o} "/$isoname=$sourcefile\n" if(!$opt_sim);
         }
      }
   }
   if($o) {
      # buffered write errors show up when the handle is closed
      if(!close($o)) {
         warn "Error while writing $prefix$i.list: $!\n";
         $ret=1;
      }
   }
}

exit $ret;


# Recursively index a directory tree and register its contents via insitem.
#
# Parameter: the directory to explore.
#
# Subdirectories are always explored recursively. What happens to the files
# found directly in a directory depends on the exploration mode ($emode):
#   1:     every file is registered as an item of its own
#   other: all files of the directory are registered as one coalesced object
#          (a list of files); when that object is larger than $max
#      3:  the files are registered one by one instead, like in mode 1
#      4:  the sorted file list is cut into several objects not exceeding $max
# Empty directories are registered as items, too.
#
# Symlinks are treated as plain files unless $opt_follow is set. With
# $opt_filter set, only names accepted by the filter expression are looked at.
#
# Dies when a directory cannot be opened, when the filter expression does not
# compile and (modes other than 1) when a single file is larger than $max.
sub explore {
   my ($dir) = @_;
   my @dirs;
   my @files;

   # Lexical handle: this sub is recursive and must not share a global one.
   opendir(my $dh, $dir) || die "Could not open $dir\n";
   my @stuff=readdir($dh);
   closedir($dh);

   if($opt_simple) {
      @stuff=sort { lc($a) cmp lc($b) } @stuff;
   }

   # A filter starting with "!" means "everything not matching". Pasting it
   # behind "$f=~" as it is would negate a match against $_ instead of $f, so
   # strip the "!" here and invert the result below.
   # NOTE(review): the expression comes straight from the command line and is
   # run through string eval, only use it with trusted input.
   my $filterExpr=$opt_filter;
   my $filterNegated=0;
   if($opt_filter) {
      $filterNegated=1 if($filterExpr =~ s/^\s*!\s*//);
   }

   foreach my $f (@stuff) {
      next if ($f eq "." || $f eq "..");

      $f="$dir/$f" if($dir ne ".");

      if ($opt_filter) {
         my $matched = eval("\$f=~$filterExpr;") ? 1 : 0;
         # a broken expression used to skip every file without any notice
         die "Invalid filter expression '$opt_filter': $@" if($@);
         next if($matched == $filterNegated);
      }

      if(-l $f && ! $opt_follow) {
         push(@files, $f);
      }
      elsif(-d $f) {
         push(@dirs, $f);
      }
      else {
         push(@files, $f);
      }
   }

   if( (@dirs + @files) == 0 ) {
      # this one is empty, register for cosmetics reason
      insitem(getsize($dir), $dir);
      return;
   }

   # recurse on directories
   explore($_) for(@dirs);

   # and now process files
   if($emode==1) {
      insitem(getsize($_), $_) for(@files);
      return;
   }

   # handle coalesced objects - first some sanity checks and splitting if
   # required
   my $filesum=0;
   for my $file (@files) {
      my $size=getsize($file);
      if($size>$max) {
         # already too large, stop right here
         die "Too large file ($file) for the given max size $max, aborting...\n";
      }
      $filesum += $size;
   }

   # handle coal. objects becoming too large
   if($filesum>$max) {
      if($emode==3) {
         # don't coalesce in this mode, do like mode 1 above, leave them alone
         insitem(getsize($_), $_) for(@files);
         return;
      }
      if($emode==4) {
         # split the file set while creating the coal. objects
         my $partsum=0;
         my @tmpvol;
         for my $file (sort(@files)) {
            my $size=getsize($file);
            if(@tmpvol && $partsum+$size>$max) {
               # The next file does not fit any more, close the current
               # object. Hand over a copy of the list: insitem keeps the
               # reference and @tmpvol is reused for the next object.
               insitem($partsum, [@tmpvol]);
               @tmpvol=();
               $partsum=0;
            }
            push(@tmpvol, $file);
            $partsum+=$size;
         }
         # do not forget the files collected after the last cut
         insitem($partsum, [@tmpvol]) if(@tmpvol);
         return;
      }
   }

   # ok, building a coalesced object for simple cases
   if($filesum) {
      insitem($filesum, \@files);
   }
}

# State of the simple mode, used by insitem: index of the bin which is filled
# at the moment and the accumulated size of each bin.
# NOTE: the main program above leaves through "exit" before this line is
# reached at run time, so the "=0" initialiser is never executed. insitem
# therefore starts with an undefined $simplePos, which counts as 0 where it is
# used as array index and incremented.
my $simplePos=0;
my @simpleBinSizes;

# Register one object for the distribution.
# args: size, object (filename or list reference)
#
# The size is appended to @sizes and the object is filed under its size in
# %names. In simple mode the object is put into a bin of @result right away.
sub insitem {
   my ($size, $object) = @_;

   # the pool used by the regular calculation
   push(@sizes, $size);
   push(@{$names{$size}}, $object);

   if($opt_simple) {
      # the simplest method to fill the bins: stay with the current one until
      # the object to be added no longer fits, then open the next
      my $filled=$simpleBinSizes[$simplePos];
      if($filled+$size > $max) {
         # the rest of the bin being left is wasted
         $globwaste += ( $max-$filled );
         $simplePos++;
      }
      $simpleBinSizes[$simplePos]+=$size;
      push( @{$result[$simplePos]}, $object);
   }

}

# Estimate the space needed for a file (or directory) on the medium.
# arg: filename
# Returns the size reported by stat, rounded up to a multiple of $bsize, plus
# an overhead of 201 and $ofac for every character of the base name.
sub getsize {
   my ($file) = @_;
   my $bytes = (stat($file))[7];

   # fill the last, partially used block
   my $overhang = $bytes % $bsize;
   $bytes += $bsize - $overhang if($overhang);

   my $nameCost = $ofac*length(basename($file));
   return 1+int(200 + $nameCost + $bytes);
}
   
# Read a list of sizes and paths and register every entry via insitem.
# arg: reference to the file handle to read from (\*STDIN or a lexical handle)
#
# Each line is expected to consist of a size (a unit suffix understood by
# fixnr is allowed), whitespace and the path. Other lines are ignored.
sub parseListe {
   # the caller passes a glob reference, read from the glob behind it
   my $fh=${$_[0]};
   # Read into a lexical: while(<$fh>) would overwrite the global $_ of the
   # caller without restoring it.
   while(defined(my $line=<$fh>)) {
      if($line =~ /^(\w+)\s+(.+)/) {
         insitem(fixnr($1), $2);
      }
   }
}

# Convert a number with an optional unit suffix into a plain number.
# args:
#   number, optionally followed by a unit letter (e.g. "700M")
#   optional: default unit, used when the number carries none
# Units: b (none), k/m/g (powers of 1000), K/M/G (powers of 1024).
# A number without unit and without default unit is returned unchanged.
# Dies on an unknown unit.
sub fixnr {
   my ($value, $defaultUnit) = @_;
   my ($count, $unit);

   if($value =~ /(\d+)(\D)/) {
      ($count, $unit) = ($1, $2);
   }
   elsif(defined($defaultUnit)) {
      ($count, $unit) = ($value, $defaultUnit);
   }
   else {
      return $value;
   }

   my %factorOf = (
      "g" => 1000000000, "G" => 1073741824,
      "m" => 1000000,    "M" => 1048576,
      "k" => 1000,       "K" => 1024,
   );
   return $count*$factorOf{$unit} if(exists($factorOf{$unit}));
   # bytes need no conversion
   return $count if($unit eq "b");
   die "$unit is not a valid multiplier!";
}


# Print the short option summary and terminate the program.
# arg: exit status
sub show_help {
   my $status = shift;
   my $summary = <<EOM;
dirsplit [options] [advanced options] < directory >

 -H|--longhelp Show the long help message with more advanced options
 -n|--no-act   Only print the commands, no action (implies -v)
 -s|--size     NUMBER - Size of the medium (default: $max)
 -e|--expmode  NUMBER - directory exploration mode (recommended, see long help)
 -m|--move     Move files to target dirs (default: create mkisofs catalogs)
 -p|--prefix   STRING - first part of catalog/directory name (default: vol_)
 -h|--help     Show this option summary
 -v|--verbose  More verbosity
                   
The complete help can be displayed with the --longhelp (-H) option.
The default mode is creating file catalogs useable with:
    mkisofs -D -r --joliet-long -graft-points -path-list CATALOG

Example:
dirsplit -m -s 700M -e2 random_data_to_backup/
EOM
   print $summary;
   exit $status;
}

# Print the complete help text and terminate the program with status 0.
# The text shows the current values of $max and $bsize.
sub show_longhelp {
   # Note on escaping: this is a double-quoted string. The newline escape of
   # the find example has to be written as \\n to show up as \n in the output,
   # otherwise the example is broken into two lines.
   my $msglong="
dirsplit [options] [advanced options] < directory >
 -n|--no-act   Only print the commands, no action (implies -v)
 -s|--size     NUMBER - Size of the medium (default: $max)
 -m|--move     Move files to target dirs (default: create mkisofs catalogs)
 -l|--symlink  similar to -m but just creates symlinks in the target dirs
 -L|--hardlink like -l but creates hardlinks
 -p|--prefix   STRING - first part of catalog/directory name (default: vol_)
 -f|--filter   EXPR - Filter expression, see examples below and perlre manpage
 --flat        Flat dir mode, don't recreate subdirectory structure (not recommended)
 -e|--expmode  NUMBER, special exploration modes, used with directory argument

  1: (default) native exploration of the specified directory, but file sizes
               are rounded up to 2048 blocks plus estimated overhead for
               filenames (see -o option)
  2: like 1, but all files in directory are put together (as \"atom\") onto the
               same medium. This does not apply to subdirectories, however.
  3: like 2, but don't coalesc files when the size of the \"atom\" becomes too
               large for the medium size (currently $max)
  4: like 2, but the max. size of the atoms is limited to $max (storing the
               rest on another medium)

 -F|--follow   Follow symlinks. Use with care!
 -b|--blksize  NUMBER, block size of the target filesystem (currently $bsize).
 -o|--overhead NUMBER, overhead caused by directory entries (as factor for the
               filename length, default: 50, empiricaly found for Joliet+RR
               with not-so-deep directory structure). Works in exploration
               mode.
 -a|--accuracy NUMBER (1=faster, large number=better efficiency, default: 20)
 -S|--simple   Simple/stupid/alphabetic mode
 -T|--input    FILENAME (or - for STDIN):  List with sizes and paths, try:
               find dir -type f -printf \"%s %p\\n\"
               to get an example. Avoid duplicates! Unit suffixes are allowed.
 -h|--help     Show this option summary
 -v|--verbose  More verbosity
                   
File sizes are expected to be in bytes, append modifier letters to multiply
with a factor, eg 200M (b,k,K,m,M,g,G for Bytes, Kb, KiB, Mb, MiB, Gb, GiB).
The default output mode is creating file catalogs useable with
    mkisofs -D -r --joliet-long -graft-points -path-list CATALOG

Examples:
dirsplit -m -s 120M -e4 largedirwithdata/ -p /zipmedia/backup_   #move stuff into splitted backup dirs
dirsplit -s 700M -e2 music/ # make mkisofs catalogs to burn all music to 700M CDRs, keep single files in each dir together
dirsplit -s 700M -e2 -f '/other\\/Soundtracks/' music/ # like above, only take files from other/Soundtracks
dirsplit -s 700M -e2 -f '!/Thumbs.db|Desktop.ini|\\.m3u\$/i' # like above, ignore some junk files and playlists, both letter cases

Bugs: overhead trough blocksize alignment and directory entry storage varies,
heavily depends on the target filesystem and configuration (see -b and -o).

You should compare the required size of the created catalogs, eg.:
for x in *list ; do mkisofs -quiet -D -r --joliet-long -graft-points \\
 -path-list \$x -print-size; done
(output in blocks of 2048 bytes) with the expected size (-s) and media data
(cdrecord -v -toc ...). 
";
   print $msglong;
   exit 0;
}

# Bin packing, best fit: every object goes into the bin where it leaves the
# least free space; a new bin is opened when it fits into none.
# Parms: bin size (int), input array (arr reference), output array (arr reference)
# Returns: wasted space (int), the free space of all bins but the last one
# The output array receives one array reference per bin, containing the
# objects (sizes) put into it. An empty input gives no bins and no waste.
sub bp_bestfit {
   my ($max, $in, $target) = @_;
   my @out;   # the bins
   my @bel;   # fill level of each bin

   for my $obj (@{$in}) {
      # one past the last bin stands for "open a new one"
      my $bestplace=$#out+1;
      my $bestwert=$max;
      for my $bin (0..$#out) {
         my $rest=$max-$bel[$bin]-$obj;
         # An object filling the bin exactly (rest 0) fits as well, this is
         # the same limit bp_firstfit applies.
         if($rest>=0 && $rest < $bestwert) {
            $bestplace=$bin;
            $bestwert=$rest;
         }
      }
      if($bestplace>$#out) {
         $bel[$bestplace]=$obj;
         push(@out, [$obj]);
      }
      else {
         $bel[$bestplace]+=$obj;
         push(@{$out[$bestplace]}, $obj);
      }
   }

   my $ret=0;
   # count all rests but the last one
   for my $bin (0..$#out-1) {
      $ret+=($max-$bel[$bin]);
   }
   @{$target} = @out;
   return $ret;
}

# Bin packing, first fit: every object goes into the first bin with enough
# free space; a new bin is opened when it fits into none.
# Parms: bin size (int), input array (arr reference), output array (arr reference)
# Returns: wasted space (int), the free space of all bins but the last one
# The output array receives one array reference per bin, containing the
# objects (sizes) put into it.
sub bp_firstfit {
   my ($max, $in, $target) = @_;
   my @out;   # the bins
   my @bel;   # fill level of each bin

   # The bin index is a lexical of the loops below; sharing a counter with
   # the code outside of this sub would be a hidden dependency.
   PIECE: foreach my $obj (@{$in}) {
      # first fit, use the first bin with enough free space
      for my $bin (0..$#out) {
         my $newsize=($bel[$bin]+$obj);
         if( $newsize <= $max ) {
            # fits here
            $bel[$bin]=$newsize;
            push(@{$out[$bin]}, $obj);
            next PIECE;
         }
      }
      # no bin had enough room, open a new one
      push(@bel, $obj);
      push(@out, [$obj]);
   }

   my $ret=0;
   # sum up all rests except of the one from the last bin
   for my $bin (0..$#out-1) {
      $ret+=($max-$bel[$bin]);
   }
   @{$target} = @out;
   return $ret;
}