#!perl
#
#   Copyright (C) 1998, 1999, 2000, 2001 Loic Dachary
#
#   This program is free software; you can redistribute it and/or modify it
#   under the terms of the GNU General Public License as published by the
#   Free Software Foundation; either version 2, or (at your option) any
#   later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#
#   You should have received a copy of the GNU General Public License
#   along with this program; if not, write to the Free Software 
#   Foundation, Inc. 59 Temple Place, Suite 330, Boston, 
#   MA  02111-1307  USA.
#

require 5.6.0;

use strict;
use vars qw($create $catid_max $record_max %path2catid %catid2path 
	    %catid2count %catid2pathid %catid2symlinks
	    %stats $verbose $today $exclude $database $mysql
	    $latin1toutf8 $utf8toother $utf8valid $encoding);
use Carp;
use Cwd;
use POSIX qw(strftime);
use Getopt::Long;
use Text::Iconv;
use Catalog::tools::tools;
use Catalog::tools::cgi;
use MD5;
use DB_File;

sub usage {
    my($message) = @_;

    $message .= "\n" if($message);

    print STDERR 
"${message}usage: $0 [--verbose] 
          [--database name (default dmoz)] [--dbopt options] 
          [--encoding name (default UTF-8)] [--exclude pattern] 
          [--dir dir (default .)] [--memory] [--cache megabytes (default 20M)]
          [--get] [--parse]
          [--load {all|category|entry2category|category2category|
                  dmozrecords|path|newsgroup}]
          content.rdf.u8[.gz] structure.rdf.u8[.gz]

          --verbose        increase the verbosity level
          --database       MySQL database name
          --dbopt          when running MySQL use these options
                           example: --dbopt '--user=foo --password=bar'
          --encoding       encoding of the output files
          --exclude        ignore categories that match the pattern
          --dir            all files are relative to this directory
          --get            mirror dmoz.org dumps using wget
          --load           load MySQL database from files generated by --parse
          --parse          parse dmoz.org dumps and generate MySQL dump files
          --memory         while running --parse do not use DB files to
                           store arrays, use ~200Mb of RAM instead.
          --cache          if --memory is not specified, amount of memory 
                           to devote to the DB files cache, minimum is 20M
Examples:
         Mirror dmoz in current directory
         $0 --get

         Generate MySQL dump files from the dmoz.org dumps
         $0 --encoding ISO-8859-1 --parse content.rdf.u8.gz structure.rdf.u8.gz

         Load the MySQL database from the files generated by --parse
         $0 --load all

         All the above in one pass
         $0 --encoding ISO-8859-1 --get --parse --load all \\
             content.rdf.u8.gz structure.rdf.u8.gz

         All in one, using memory (2 or 3 times faster)
         assuming you have ~200Mb of free RAM
         $0 --encoding ISO-8859-1 --get --parse --load all \\
            --memory content.rdf.u8.gz structure.rdf.u8.gz
";
    exit(1);
}

sub main {
    my($load);
    my($get);
    my($parse);
    my($help);
    my($memory);
    my($cache);

    my($dbopt) = '';
    my($dir) = '.';
    $database = 'dmoz';
    $encoding = 'UTF-8';

    my($getopt);

    $getopt = GetOptions("verbose+" => \$verbose,
			 "load=s" => \$load,
			 "get" => \$get,
			 "parse" => \$parse,
			 "exclude=s" => \$exclude,
			 "database=s" => \$database,
			 "encoding=s" => \$encoding,
			 "dir=s" => \$dir,
			 "memory" => \$memory,
			 "cache" => \$cache,
			 "dbopt=s" => \$dbopt,
			 "help" => \$help);


    my($content, $structure) = @ARGV;

    usage() if(!$getopt || $help);
    usage("specify either --get or --parse or --load") if(!$load && !$parse && !$get);
    usage("missing arguments content.rdf.u8[.gz] structure.rdf.u8[.gz]") if($parse && (!$structure || !$content));

    $dir .= '/' if($dir !~ m:/$:);

    $utf8valid = Text::Iconv->new("UTF-8", "UTF-8");
    $latin1toutf8 = Text::Iconv->new("ISO-8859-1", "UTF-8");
    $utf8toother = Text::Iconv->new("UTF-8", $encoding) if($encoding ne 'UTF-8');

    if($parse && !$memory) {
	my(@vars) = (qw(path2catid catid2path catid2count catid2pathid catid2symlinks));
	$cache =~ s/m$//i;
	$cache = 20 if($cache < 20);
	$DB_HASH->{'cachesize'} = ($cache * 1024 * 1024) / scalar(@vars);
	$DB_HASH->{'nelem'} = 300000;
	my($var);
	foreach $var (@vars) {
	    my($dbfile) = "$dir/$var.db";
	    unlink($dbfile);
	    eval "tie(%$var, 'DB_File', '$dbfile', O_RDWR|O_CREAT, 0777, \$DB_HASH)";
	    croak($@) if($@);
	}
    }

    $mysql = "mysql $dbopt";

    $| = 1;

    if($get) {
	my($mirror) = ($verbose ? "set -x\n" : "") . "
        cd $dir
	wget --timestamping --quiet http://dmoz.org/rdf/structure.rdf.u8.gz
	wget --timestamping --quiet http://dmoz.org/rdf/content.rdf.u8.gz
	wget --timestamping --quiet http://dmoz.org/rdf/terms.rdf.u8
	wget --timestamping --quiet http://dmoz.org/rdf/kt-structure.rdf.u8.gz
	wget --timestamping --quiet http://dmoz.org/rdf/kt-content.rdf.u8.gz
	wget --timestamping --quiet http://dmoz.org/rdf/kt-terms.rdf.u8.gz
	wget --timestamping --quiet http://dmoz.org/rdf/redirect.rdf.u8.gz
	wget --timestamping --quiet http://dmoz.org/rdf/profiles.rdf.u8.gz
	rm -f *.u8.gz.1 *.u8.1
";
	print $mirror if($verbose > 1);
	system($mirror);
    }
    
    if($parse) {
	print STDERR "convert structure from $structure\n"  if($verbose > 1);
	convert('structure', $dir, $structure);
	print STDERR "convert content from $content\n" if($verbose > 1);
	convert('content', $dir, $content);
    }

    if($load) {
	file_load($load, $dir);
    }

    stats();

    my($var);
    foreach $var (qw(path2catid catid2path catid2count catid2pathid catid2symlinks)) {
	my($dbfile) = "$dir/$var.db";
	unlink($dbfile);
    }
}

#
# Convert string to the encoding specified by --encoding
#
sub toencoding {
    my($string) = @_;

    return $string if($encoding eq 'UTF-8');

    my($tmp) = $utf8toother->convert($string);
    return '' if(!defined($tmp));
    return $tmp;
}

#
# Fake conversion to check that the UTF-8 string is not corrupted.
#
sub utf8_valid {
    my($string) = @_;

    return 1 if(!$string || $utf8valid->convert($string));
    warn("$string is not a valid UTF-8 string") if($verbose);
    return 0;
}

#
# Hanlde --load
#
sub file_load {
    my($what, $dir) = @_;

    my(%file2table) = ( 
			(map { ("$_.txt" => "catalog_${_}_dmoz") } (qw(category entry2category category2category path newsgroup))),
			'dmozrecords.txt' => 'dmozrecords',
		       );
    
    $dir = getcwd() . "/$dir" if($dir !~ m|^/|);

    while(my($file, $table) = each(%file2table)) {
	next if($what ne 'all' && "$what.txt" ne $file);
	my($input) = "${dir}$file";
	my($cmd);
	$cmd = "$mysql -e \"delete from $table;\" $database";
	print "$cmd\n" if($verbose);
	system($cmd);
	$cmd = "$mysql -e \"load data infile '$input' ignore into table $table;\" $database";
	print "$cmd\n" if($verbose);
	system($cmd);
	system("echo $table; $mysql -N -e \"select count(*) from $table;\" $database");
	if($table eq 'catalog_category_dmoz') {
	    $cmd = "$mysql $database < ${dir}category_count.sql";
	    print "$cmd\n" if($verbose);
	    system($cmd);
	}
    } 
}

#
# Handle --parse
#
sub convert {
    my($what, $dir, $input) = @_;

    $today = strftime("%Y/%m/%d", localtime());

    $input = "${dir}$input" if($input !~ m:^/:);

    print "$what ";
    my($count) = 0;
    my($threshold) = 10000;
    $input = "gzip -dc '$input' |" if $input =~ /\.gz/;
    open(FROM, "$input") or error("cannot open $input for reading : $!");

    my($mode) = $what eq 'structure' ? '>' : '>>';

    open(CATEGORY, "$mode${dir}category.txt") or error("cannot open category.txt for writing : $!");
    open(PATH, "$mode${dir}path.txt") or error("cannot open path.txt for writing : $!");
    open(ENTRY2CATEGORY, "$mode${dir}entry2category.txt") or error("cannot open entry2category.txt for writing : $!");
    open(CATEGORY2CATEGORY, "$mode${dir}category2category.txt") or error("cannot open category2category.txt for writing : $!");
    if($what eq 'content') {
	open(DMOZRECORDS, ">${dir}dmozrecords.txt") or error("cannot open dmozrecord.txt for writing : $!");
    }
    if($what eq 'structure') {
	open(NEWSGROUP, ">${dir}newsgroup.txt") or error("cannot open newsgroup.txt for writing : $!");
    }
    my @buffer;
    my($accumulate);
    while(<FROM>) {
	if(/^<Topic/) {
	    if($accumulate) {
		handle($what, join("", @buffer));
		@buffer = ();
	    }
	    $accumulate = 1;
	}
	push(@buffer, $_) if $accumulate;
	print "." if(++$count % $threshold == 0);
    }

    handle($what, join("", @buffer));

    #
    # Symlinks are collected at structure pass but only output at
    # content pass so that symlink pointers will find a category
    # even if (because of a bug in the dump) the category was only
    # listed in content and not in structure.
    #
    if($what eq 'content') {
	handle_symlinks();
    }

    close(CATEGORY);
    close(PATH);
    close(ENTRY2CATEGORY);
    close(CATEGORY2CATEGORY);
    close(FROM);
    if($what eq 'content') {
	close(DMOZRECORDS);

	handle_counts($dir);
    }
    if($what eq 'structure') {
	close(NEWSGROUP);
    }
    print "done\n";
}

#
# Clean a string of non-printable chars, spurious tab and newlines,
# and convert escaped chars.
#
sub fixup {
    my($string, $default) = @_;

    #
    # Unfortunately perl-5.6 chokes and core dump if provided
    # with strings that are not valid UTF-8.
    #
    return '' if(!utf8_valid($string));

    use utf8;

    return $default if(!$string || $string =~ /^\s*$/s);

    #
    # Non printable char
    #
    $string =~ s/[[:^print:]]//gs;
    #
    # Those are normaly converted by the XML parser
    #
    $string =~ s/&amp;/&/gs;
    $string =~ s/&lt;/</gs;
    $string =~ s/&gt;/>/gs;
    $string =~ s/&quot;/\"/gs;
    $string =~ s/&apos;/\'/gs;
    #
    # Replace spaces by a white space so that no tabulation or newline
    # shows in the output: they are separators.
    #
    $string =~ s/\s+/ /gs;
    #
    # Trailing spaces are never significant
    #
    $string =~ s/\s*\Z//s;

    return $string;
}

$catid_max = 10000000;
$record_max = 1;

#
# Parse a chunk going from a <Topic> tag to another.
#
sub handle {
    my($what, $buffer) = @_;

    my($path, $catid);
    $buffer =~ m|<Topic r:id=\"(Top\C*?)\">|;
    $path = fixup($1);
    my($isroot) = $path eq 'Top';
    print STDERR "handle: $path\n" if($verbose > 2);
    if($path && !toencoding($path)) {
	warn("cannot encode category path $path, ignored\n") if($verbose);
	return;
    }
    
    if($exclude && $path =~ /$exclude/os) {
	warn("exclude category $path") if($verbose);
	$stats{'Excluded Category'}++;
	return;
    }
    
    my($definition) = 0;
    $catid = $1 if($buffer =~ m|<catid>(\d+)</catid>|s);
    if(exists($path2catid{$path})) {
	if($path2catid{$path} ne $catid) {
	    warn("catid redefined for $path ($path2catid{$path} -> $catid) ignore") if($verbose);
	    $catid = $path2catid{$path};
	}
	warn("multiple instances of $path") if($what eq 'structure' && $verbose);
	$definition = 1;
    }
    if(defined($catid) &&
       exists($catid2path{$catid}) &&
       $catid2path{$catid} ne $path) {
	warn("catid for $path already used by $catid2path{$catid}, ignore") if($verbose);
	$stats{'Category id clash'}++;
	$catid = undef;
    }
    if(!defined($catid)) {
	$catid = $catid_max++;
	$stats{'Category id generated'}++;
	warn("missing <catid> for $path, generate $catid") if($verbose);
    }
    $path2catid{$path} = $catid;
    $catid2path{$catid} = $path;

    if(!$definition) {
	$path =~ m|(\C*)/(\C*)|;
	my($parent, $name) = ($1, $2);
	if($isroot && $what eq 'structure') {
	    $stats{'root'}++;
	    warn("more than one root : $path") if($stats{'root'} > 1 && $verbose);
	}
	$name = fixup($name, 'No name');
	if($isroot) {
	    $catid2pathid{$catid} = $catid;
	} else {
	    my($parent_id) = find_parent_id($parent);
	    $catid2pathid{$catid} = $catid2pathid{$parent_id} . ",$catid";
	    print CATEGORY2CATEGORY toencoding("0\t$today\t$today\t0\t$parent_id\t$catid\t\n");
	}
	my($view_path) = view_path($path);
	my($md5) = MD5->hexhash(toencoding($view_path));
	print PATH toencoding("$view_path\t$md5\t,$catid2pathid{$catid},\t$catid\n");
	print CATEGORY toencoding("$catid\t$today\t$today\t0\t$name\t0\t\n");
	$stats{'Category loaded'}++;
    }

    if($what eq 'structure') {
	my(@symlinks);
	while($buffer =~ m|<symbolic\s+r:resource="(\C*?):Top(\C*?)"/>|sg) {
	    push(@symlinks, $1, $2);
	}
	$catid2symlinks{$catid} = join("\t", @symlinks) if(@symlinks);
	
	while($buffer =~ m|<newsGroup\s+r:resource="(news:\C*?)"/>|sg) {
	    my($url) = $1;
	    my($out) = toencoding("0\t$today\t$today\t$catid\t$url\n");
	    if(!$out) {
		warn("cannot encode newsgroup $url, ignored") if($verbose);
		next;
	    }
	    print NEWSGROUP $out;
	}
    }

    my(@records);
    while($buffer =~ m|<ExternalPage (\C*?)</ExternalPage>|sg) {
	my($record) = $1;
	$record =~ m|about=\"(\C*?)\"|s;
	#
	# URLs are not UTF-8 encoded but 8859-1 encoded, convert to UTF-8
	# for consistency.
	#
	my($url) = $latin1toutf8->convert($1);
	$record =~ m|<d:Title>(\C*)</d:Title>|s;
	my($title) = $1;
	$record =~ m|<d:Description>(.*)</d:Description>|s;
	my($description) = $1;

	push(@records, {
	    'rowid' => $record_max,
	    'url' => fixup($url, 'Nourl'),
	    'title' => fixup($title, 'No title'),
	    'description' => fixup($description, 'No description'),
	});
	$record_max++;
	$stats{'Record loaded'}++;
    }

    my($record_count) = 0;
    my($record);
    foreach $record (@records) {
	my($out) = toencoding("$record->{'rowid'}\t$today\t$today\tactive\t$record->{'url'}\t$record->{'title'}\t$record->{'description'}\n");
	#
	# Skip entries that cannot be encoded
	#
	next if(!$out);
	$record_count++;
	print DMOZRECORDS $out;
	print ENTRY2CATEGORY toencoding("$today\t$today\t0\t$record->{'rowid'}\t$catid\n");
    }

    if($what eq 'content') {
	update_count($catid, $record_count);
    }
}

sub find_parent_id {
    my($path) = @_;

    if(exists($path2catid{$path})) {
	return $path2catid{$path};
    } elsif($path) {
	$path =~ m|(\C*)/(\C*)|;
	my($parent, $name) = ($1, $2);
	my($parent_id) = find_parent_id($parent);
	my($catid) = $catid_max++;
	$path2catid{$path} = $catid;
	$catid2path{$catid} = $path;
	$catid2pathid{$catid} = $catid2pathid{$parent_id} . ",$catid";
	warn("$path automatically generated") if($verbose);
	$stats{'Generated category'}++;
	my($out) = toencoding("$catid\t$today\t$today\t0\t$name\t0\t\n");
	die "cannot be encoded" if(!$out);
	my($view_path) = view_path($path);
	my($md5) = MD5->hexhash(toencoding($view_path));
	print PATH toencoding("$view_path\t$md5\t,$catid2pathid{$catid},\t$catid\n");
	print CATEGORY $out;
	print CATEGORY2CATEGORY toencoding("0\t$today\t$today\t0\t$parent_id\t$catid\t\n");
        return $catid;
    } else {
	croak "root category was not found\n";
    }
}

sub update_count {
    my($catid, $count) = @_;

    foreach $catid (split(',', $catid2pathid{$catid})) {
	$catid2count{$catid} += $count;
    }
}

sub handle_symlinks {
    while(my($catid, $paths) = each(%catid2symlinks)) {
	my(@list) = map { fixup($_) } split("\t", $paths);
	while(@list) {
	    my($name) = shift(@list);
	    my($path) = shift(@list);
	    if(!$path || $path =~ /^\s*$/) {
		warn("symlinks in category $catid2path{$catid} has null path, ignored") if($verbose);
		next;
	    }
	    if(!$name || $name =~ /^\s*$/) {
		warn("symlinks in category $catid2path{$catid} has null name, ignored") if($verbose);
		next;
	    }
	    if(!exists($path2catid{$path})) {
		warn("symlink $path in category $catid2path{$catid} points to the non existent category $path, ignored") if($verbose);
		next;
	    }
	    my($other_catid) = $path2catid{$path};
	    print CATEGORY2CATEGORY toencoding("0\t$today\t$today\tsymlink\t$catid\t$other_catid\t$name\n");
	}
    }
}

sub handle_counts {
    my($dir) = @_;

    open(COUNT, ">${dir}category_count.sql") or die "cannot open ${dir}category_count.sql for writing : $!";
    while(my($catid, $count) = each(%catid2count)) {
	print COUNT "update catalog_category_dmoz set count = $count where rowid = $catid;\n";
    }
    close(COUNT);
}

#
# Convert category path for inclusion in the path table
#
sub view_path {
    my($path) = @_;
    if($path eq 'Top') {
	$path = '/';
    } else {
	$path =~ s/[ \'\"]/_/g;
	$path .= '/';
    }
    return $path;
}

#
# Display statistics
#
sub stats {
    return if(!$verbose);
    print "\nStatistics: \n" if(keys(%stats));
    my($label, $value);
    while(($label, $value) = each(%stats)) {
	print "$label = $value\n";
    }
}

main();
