#!/usr/bin/perl -w
#==========================================================
# Split up the DMOZ content file into categories.
# Date  : 20th June 2003
# Author: Paul Wilson
# Copyright Paul Wilson, 2003. All Rights Reserved.
#==========================================================

use strict;
use warnings;
use Getopt::Long;

main();

#==========================================================

sub main {
#----------------------------------------------------------
# Split up the dmoz data.
#
# Reads the RDF file named by --rdf line by line. Copying to
# the --file output starts at the first line that contains the
# opening Topic marker for --cat and stops at the first Topic
# element whose id does not begin with --cat. Progress counters
# are printed to STDOUT while lines are written.
#
# Dies if invoked with REQUEST_METHOD set in the environment,
# or if either file cannot be opened / the output cannot be
# closed. Prints the usage page and returns when a required
# option is missing.
#
    die(qq[$0 must not be called via a browser.]) if ($ENV{REQUEST_METHOD});

# Get our options. --verbose is accepted for command line
# compatibility, but nothing in this script reads it.
    my ($verbose, $rdf, $out, $cat);
    GetOptions(
        q[v|verbose]      => \$verbose,
        q[rdf|data=s]     => \$rdf,
        q[file|out=s]     => \$out,
        q[category|cat=s] => \$cat,
    );

# Bad options supplied.
    return _usage() unless ($rdf and $out and $cat);

# NOTE(review): the two Topic literals below were empty in the
# copy of the script under review (the angle-bracket markup
# appears to have been stripped). They are restored here on the
# assumption that the dump marks categories as
# <Topic r:id="Top/...">  -- confirm against a real dump file.
    my $start_marker = qq|<Topic r:id="$cat|;
    my $topic_re     = qr/<Topic r:id="(.*?)">/;

# Read the data file. Three-argument open keeps characters in
# the user supplied paths from being treated as an open mode.
    open(my $in_fh, '<', $rdf)
        or die(qq[Unable to read rdf file '$rdf'. Reason: $!]);

    my $out_fh;         # stays undefined until the category is found
    my $written = 0;    # number of lines copied so far

    while (my $line = <$in_fh>) {

    # Skip everything ahead of the requested category; the out
    # file is only created once there is something to write.
        if (! $out_fh) {
            next if (index($line, $start_marker) < 0);
            open($out_fh, '>', $out)
                or die(qq[Can't open out file '$out'. Reason: $!]);
        }

    # A Topic whose id no longer starts with the requested
    # category ends the section being extracted.
        if ($line =~ $topic_re) {
            last if (substr($1, 0, length($cat)) ne $cat);
        }

        print {$out_fh} $line;

    # One-off banner ahead of the progress counters.
        unless ($written) {
            print qq[\nPreparing to write data to $out\n\n];
            sleep(3);
            print qq[Lines of data written:\n\n];
            sleep(3);
        }

    # A counter every 15,000 lines, five counters per row.
        ++$written;
        print qq[$written ] . (q[ ] x _pos($written)) if (! ($written % 15_000));
        print qq[\n]                                   if (! ($written % 75_000));
    }

    close($in_fh);

# Never saw the category: say so rather than reporting success.
    unless ($out_fh) {
        warn(qq[Category '$cat' was not found in '$rdf'. Nothing was written.\n]);
        return;
    }

# Buffered write errors only surface at close, so check it.
    close($out_fh) or die(qq[Can't close out file '$out'. Reason: $!]);

    print qq!\n\nExport complete.\n!;
}

sub _pos {
#----------------------------------------------------------
# For formatting. Takes the running line count and returns the
# number of padding spaces to print after it: 7 for counts below
# 100,000, then one space fewer per extra digit down to 4, and 3
# for anything outside those ranges (including zero or less).
#
    my ($count) = @_;

    return ($count > 0         and $count < 100_000)     ? 7
         : ($count > 99_999    and $count < 1_000_000)   ? 6
         : ($count > 999_999   and $count < 10_000_000)  ? 5
         : ($count > 9_999_999 and $count < 100_000_000) ? 4
         :                                                 3;
}

sub _usage {
#----------------------------------------------------------
# Show the usage page. The text after __DATA__ is used as a
# sprintf template; its single %s receives the script name.
#
    print sprintf(join("", <DATA>), $0);
}

__DATA__

RDF Splitter By Paul Wilson. Copyright 2003. All Rights Reserved.
================================================================

Usage: %s --rdf=content.rdf --file=new.out --cat=Top

--rdf OR --data
    This option must contain the full path to the dmoz rdf file.

--file OR --out
    This option must contain the path to a desired output file.

--category OR --cat
    This option specifies the category to extract.

--v OR --verbose
    This option will show verbose output.