Search form

Perl Hyperlink Relativizer

This was written to strip local domains from hyperlinks in text files, e.g., www.geoffstratton.com/mylink.html will become /mylink.html.

#!/usr/bin/perl -cw
#
# Strip local domains from link urls.
# Each file will be modified in place and a unique backup saved.

use strict;

my $DEBUG = 0;
sub dbg_print {
	if ($DEBUG > 0) { foreach my $str (@_) { print "$str\n"; } }
}

# Trim away leading/trailing whitespace
sub trim {
	my @out = @_;
	for (@out) { s/^\s+//; s/\s+$//; }
	return wantarray ? @out : $out[0];
}

# Test whether a list contains a certain value
sub contains {
	my ($val, @list) = @_;
	return grep { $_ 0eq $val } @list;
}


# Parser class to replace font tags with suitable values
{
	package LinkRelativizer;
	use base 'HTML::Parser';
	use 'URI';

	# Can't figure out how to import these functions from main
	# Alias them instead
	*trim = \&main::trim;
	*dbg_print = \&main::dbg_print;
	*contains = \&main::contains;

	# presentational attributes to remove
	my @CHANGE_ATTR = ('href', 'src');
	my @LOCAL_DOMAINS (
		'put your domains here',
		'another domain',
	);

	my $result = '';	# store processed document in this string

	# Create a text representation of a tag.
	# Uses attrseq for attributes; values in attr but not attrseq won't
	# be copied.  This allows stripping attributes while still saving
	# their value for later processing.
	sub make_text {
	    my ($self, $tagname, $attr, $attrseq) = @_;

	    my $text = "<$tagname";
	    # The / on /> closing marker for strict xhtml tags is not an
	    # attribute but parser catches it as one.  We remedy that here.
	    foreach my $key (grep { $_ ne '/' } @$attrseq)
	    {
	        my $val = $attr->{$key};
	        $text .= " $key=\"$val\"";
	    }
	    $text .= exists $attr->{'/'} ? ' />' : '>';
	    #print "    $text\n";
	}

	# Process start tags.
	sub start {
	    my ($self, $tagname, $attr, $attrseq, $text) = @_;

		foreach my $key (@CHANGE_ATTR) {
			if (not exists $attr->{$key})	{ next; }

			my $link = $attr->{$key};
			my $domain = 'empty';
			if ($link =~ m|^http://([^/]+)|) { $domain = $1; }

			if (not contains ($domain, @LOCAL_DOMAINS)) {
				# this isn't the domain you're looking for...
				next;
			}
			
			$link =~ s{^http://[^/]+(/.*)}{$1}i;
			$attr->{$key} = $link;
			$text = $self->make_text ($tagname, $attr, $attrseq);
		}
		dbg_print ("tag = $text");
	    $result .= $text;
	}

	# Everything else gets spit out verbatim
	sub text {
	    my ($self, $text, $event) = @_;

		#my $foo = trim ($text);
		#if ($foo) { print "event $event: $foo\n" }
	    $result .= $text;
		dbg_print ("text = $text")	if $text =~ /\S/;
	}

	# Return the processed document and clear the storage buffer
	# Must be called before parsing new doc
	sub finish {
	    my $doc = $result;
		# reset package vars
	    $result = '';
	    return $doc;
	}
#    # Use same handler as text for these types
#    *comment     = \&text;
#    *declaration = \&text;
#    *process     = \&text;

};   # end LinkRelativizer


# ------------- Main --------------------

my $save_backup = 1;
my $rename_file = 1;
foreach my $arg (@ARGV) {
	if ($arg eq '-f') {
		$save_backup = 0;
		next;
	} elsif ($arg eq '-r') {
		$rename_file = 0;
		$save_backup = 0;
		next;
	}
	my $file = $arg;
	my $tmpfile = "$file.tmp";
	open (OLD, "<$file") or die "open failed ($file): $!";
	open (NEW, ">$tmpfile") or die "open failed ($tmpfile): $!";

	my @lines = <OLD>;
	my $content = join ('', @lines);

	my $parser = LinkRelativizer->new(
	    #api_version => 3,
	    #default_h => [sub { print shift }, 'text'],
	    default_h => ['text', 'self,text,event'],
	    start_h => ['start', 'self,tagname,attr,attrseq,text'],
	   # end_h => ['end', 'self,tagname,text'],
	);
	#$parser->xml_mode (1);
	#$parser->parse_file ($file);
	$parser->parse ($content);
	$parser->eof;
	$content = $parser->finish;

	#font: 8pt 'MS Comic Sans','Arial',sans-serif;

	print NEW $content;
	close (OLD) or die "close failed ($file): $!";
	close (NEW) or die "close failed ($tmpfile): $!";

	if ($save_backup)	{
	    my $bakfile = "$file.bak";
		my $cnt = 0;
		while (-e $bakfile)     { $bakfile = "$file.bak.$cnt"; $cnt++; }
		rename ($file, $bakfile) or die "move failed ($file -> $bakfile): $!";
		print "saved $file as $bakfile\n"
	}
	if ($rename_file) {
		rename ($tmpfile, $file) or die "move failed ($tmpfile -> $file): $!";
	} else {
	    warn "WARNING: rename disabled\n"; 
	}
}