#!/usr/local/bin/perl -s
#<plaintext>
# Time-stamp: <95/09/15 02:32:32 rgs>
#
# Usage: htmlify [-quiet -width=n -nohyphen -smtp -dlpattern=regex -<cdes>header=Hn] file
# "width" is the typical width of a line [default=72]
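# unless "nohyphen" is specified, assume that a full-width line ending in a
#     single hyphen is a word which was split at a syllable boundary, and drop
#     the hyphen when joining it to the next line; with "nohyphen" the lines
#     are still joined but the hyphen is kept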
# "cheader", "dheader", "eheader", "sheader" are the types of header (e.g.
#     "H1") which should be used for implicit headers formed by "all caps",
#     "dash (-) underlining", "equal (=) underlining", or "star (*)
#     underlining" respectively
# "smtp" is used to indicate that this file may contain headers from a mail
#     or netnews message.
# "dlpattern" is a pattern which can be matched at the beginning of a line to
#     indicate that the line contains a "term" in a "definition list"
#
# This script was written by Robert Stockton (rgs@cs.cmu.edu)
# This falls under the category of "quick hack".  It is unlikely to be 
# especially readable or maintainable, nor can it be considered "industrial
# strength".  However, it has proven useful enough to be worth keeping
# until something better comes along.
#
#If you are reading this via mosaic, it may be interpreted as html --
#it will probably be clearer if you read the "document source" instead....

# ---------------------------------------------------------------------------
# Main program.
#
# Input is read one paragraph at a time.  Each paragraph is repeatedly split
# at the first "separator": an underlined line, an all-caps line, or a bare
# rule line (---, ===, +++, ___).  Text before the separator is handed to
# &dotext; the separator itself becomes a header or an <HR>.
#
# The option variables ($cheader, $dheader, $eheader, $sheader, $width,
# $smtp, $nohyphen, $quiet, $dlpattern) may be preset by perl's -s switch.
#
# The obsolete "$* = 1" (multi-line matching for every regex) is no longer
# accepted by perl, so each pattern below carries an explicit /m instead.
# ---------------------------------------------------------------------------
$/ = "";			# paragraph mode
$| = 1;				# flush immediately
$body = 0;
$pindent = 0;
$newpara = 0;
defined($cheader) || ($cheader = "H2");
defined($dheader) || ($dheader = "H2");
defined($eheader) || ($eheader = "H1");
defined($sheader) || ($sheader = "H1");
$width || ($width = 72);
$smtp || ($smtp = 0);
# $dlpattern = "^[^ ].*:";
$curtag = "TEXT";
$title = "*** No Title ***";
$inbody = 0;

# Pattern recognising a line that ends in a single hyphen.  The capture is
# what survives when the line is joined to the next one: with -nohyphen the
# hyphen is inside the capture (kept), otherwise it is outside (dropped).
if ($nohyphen) {
  $hstring = "^(.*[^-]-) *$";
} else {
  $hstring = "^(.*[^-])- *$";
}

while (<>) {
    $oldtxt = "";		# text seen so far that is not yet printed
    # Test length rather than truth so that a trailing fragment consisting
    # of just "0" is not silently discarded.
    while (length($_)) {
        ($txt, $sep, $rest) = split(/^(.+\n\s*-+\s*\n|.+\n\s*\*+\s*\n|.+\n\s*=+\s*\n|[^a-z\n]*[A-Z][^a-z\n]*\n|--+\n|==+\n|\+\++\n|__+\n)/m,$_,2);
        if ($sep) {
            # Expand tabs so that title and underline lengths are comparable.
            while (($tab = index($sep, "\t")) >= 0) {
                substr($sep, $tab, 1) = (' ' x (8 - ($tab % 8)));
            }
            if ($dheader && $sep =~ /^( *)(.*[^ \n]) *\n *(-+) *\n/m) {
                &emit_underlined($dheader, $1, $2, $3);
            } elsif ($sheader && $sep =~ /^( *)(.*[^ \n]) *\n *(\*+) *\n/m) {
                &emit_underlined($sheader, $1, $2, $3);
            } elsif ($eheader && $sep =~ /^( *)(.*[^ \n]) *\n *(=+) *\n/m) {
                &emit_underlined($eheader, $1, $2, $3);
            } elsif ($cheader && $sep =~ /^( *)([^a-z\n]*[A-Z][^a-z\n]*)\n/m) {
                &emit_header($cheader, $1, $2);
            } elsif ($sep =~ /^[ \t]*(--+\n|==+\n|\+\++\n|__+\n)$/m) {
                &flush_text("");
                &prbreak("<HR>", *newpara);
            } else {
                # The separator matched but its header type is disabled:
                # treat it as ordinary text.
                $oldtxt = $oldtxt . $txt . $sep;
            }
            $_ = $rest;
        } else {
            # No separator left in this paragraph.
            &flush_text("");
            $_ = "";
        }
    }
    $newpara = 1;
}
# &prbreak prints the <HTML>...<BODY> preamble itself if nothing has been
# printed yet, so the closing tags are always required -- even for empty
# input, which previously produced an unterminated document.
$finish = "</BODY>\n</HTML>";
&prbreak($finish, *newpara);

exit;

# flush_text(EXTRA)
# Send the pending text ($oldtxt), the text preceding the current separator
# ($txt) and EXTRA to &dotext, then clear the pending text.
sub flush_text {
    my($extra) = @_;
    &dotext($oldtxt . $txt . $extra);
    $oldtxt = "";
}

# emit_header(TAG, LEAD, TEXT)
# Output TEXT as a <TAG> header preceded by the indentation LEAD.  TEXT is
# HTML-escaped first, because it is written straight into the document and
# into <TITLE> (via $title) without passing through &dotext.  $title is set
# before the pending text is flushed so that the first header of a document
# can still become its title.
sub emit_header {
    my($htag, $lead, $htext) = @_;
    $htext = &escape_html($htext);
    $title = $htext;
    &flush_text("");
    &prbreak("$lead<$htag>$htext</$htag>", *newpara);
}

# emit_underlined(TAG, LEAD, TEXT, RULE)
# Handle a line of TEXT followed by a RULE of dashes, stars or equal signs.
# If the rule is exactly as long as the text it is a header; otherwise the
# text is ordinary text and the rule becomes a horizontal rule.
sub emit_underlined {
    my($htag, $lead, $htext, $rule) = @_;
    if (length($htext) == length($rule)) {
        &emit_header($htag, $lead, $htext);
    } else {
        &flush_text("$lead$htext\n");	# escaped later by &dotext
        &prbreak("<HR>", *newpara);
    }
}

# escape_html(STRING)
# Return STRING with &, > and < replaced by their HTML entities.
sub escape_html {
    my($s) = @_;
    $s =~ s/&/&amp;/g;
    $s =~ s/>/&gt;/g;
    $s =~ s/</&lt;/g;
    return $s;
}

# dotext(TEXT)
#
# Convert one run of plain text to HTML and print it through &prline.
# TEXT is HTML-escaped, then either
#   - printed as one <PRE> block when -smtp is set and the whole text looks
#     like a block of mail/news headers (the Subject line becomes $title), or
#   - split into lines, each of which is classified as a paragraph break,
#     definition-list term ($dlpattern), quoted mail text (-smtp), numbered
#     or bulleted list item, short line, or full-width line, and printed
#     with the matching tag.
#
# Reads the globals $smtp, $dlpattern, $width, $hstring and $quiet; updates
# the shared state $newpara, $title, $lihack, $dlhack, $indent, $pindent,
# $lindent, $maxlen, $text, $hyphen, $dlrest and $line.
#
# The patterns applied to the whole text use \A, \Z and \z so that they do
# not depend on the obsolete $* variable, which perl no longer accepts.
sub dotext {
    local($txt) = pop(@_);
    my(@lines);

    $txt =~ s/\&/&amp;/g;
    $txt =~ s/\>/&gt;/g;
    $txt =~ s/\</&lt;/g;

    if ($smtp &&
        $txt =~ /\A[^\t \n:]+:.*(\n[ \t].*|\n[^\t \n:]+:.*)*\n*\Z/)
    {
# One attempt at coming up with clean headers -- a better implementation of
# <DL COMPACT> might allow this.
#	$txt =~ s/\n([^\t \n:]+):/"<DT><B>$1<\/B>:<DD>"/eg;
#	$txt =~ s/^([^\t \n:]+):/"<DL COMPACT>\n<DT><B>$1<\/B>:<DD>"/e;
#	print("$txt\n</DL>\n");
        ($txt =~ /subject:[\t ]*(.*)/i) && ($title = $1);
        $txt =~ s/\n+\z//;		# strip all trailing newlines
        &prline(0, $txt, *newpara, 0, "PRE");
        return;
    }

    # Split into a named array; the implicit "split into @_" performed by a
    # void-context split has been removed from perl.
    @lines = split(/\n/, $txt);

    # First pass: expand tabs in place and find the smallest indentation
    # and the longest line body.
    $indent = 9999;
    $maxlen = 0;
    foreach $line (@lines) {
        while (($tab = index($line, "\t")) >= 0) {
            substr($line, $tab, 1) = (' ' x (8 - ($tab % 8)));
        }
        $line =~ /^( *)(.*)/;
        (length($1) < $indent) && ($indent = length($1));
        (length($2) > $maxlen) && ($maxlen = length($2));
    }

    # Extra indentation of the first line relative to the rest.
    $line = $lines[0];
    $line =~ /^( *)/;
    $pindent = length($1) - $indent;

    # Second pass: classify and print each line.
    $text = "";
    $hyphen = 0;
    foreach $line (@lines) {
        $line =~ /^( *)(.*)/;
        $lindent = length($1);
        # The previous line ended in a hyphen: glue this line's body onto
        # what was kept of it.
        $hyphen && ($line = $text . $2);
        $text = $2;
        if (($pindent != 0) && ($lindent == $pindent)) {
            # Another line indented like the first one starts a paragraph.
            $lindent = $indent unless defined($lihack);
            $newpara = 1;
        }
        $hyphen = 0;
        if ($line =~ /^ *$/ ||
            ($smtp && $line =~ /^(([ :+|]|&gt;)*(&gt;|[:+|])) *$/)) {
            # Blank line, or a line holding nothing but quote markers.
            $newpara = 1;
        } elsif ($dlpattern && $line =~ /$dlpattern/o) {
            $dlrest = $';
            # NOTE(review): $1 here is whatever group 1 of the user-supplied
            # $dlpattern captured, if it has one -- confirm this is intended.
            &prline($lindent, "<DT>" . ' ' x (length($1)-4) . "$&",
                    *newpara, 0, "DL");
            # The paragraph-flag glob comes third and its new value fourth,
            # as in every other call.  The two used to be swapped here,
            # which aliased $newpara to $0 inside &prline.
            &prline(length($&), "<DD>$dlrest", *newpara, 0, "DD");
            if ($dlrest eq "") {
                $dlhack = 1;
            }
        } elsif ($smtp && $line =~ /^(([ :+|]|&gt;)*(&gt;|[:+|]))/) {
            # should be BLOCKQUOTE, but it doesn't nest properly in Mosaic2.1
            # &prline(length($1), "$'", *newpara, 0, "BLOCKQUOTE");
            &prline(length($1), "$'", *newpara, 0, "UL");
        } elsif ($line =~ /^( *[\(\[]?[0-9]+[\)\.\]] +)(.*)/) {
            # Numbered list item: "1.", "(2)", "[3]", ...
            &prline(length($1), "<LI>" . ' ' x (length($1)-4) . "$2",
                    *newpara, 0, "OL");
            $lihack = $lindent;
        } elsif ($line =~ /^( *[\[\(]?[*-]+[\)\]]? +)(.*)/) {
            # Bulleted list item: "*", "-", "(*)", ...
            &prline(length($1), "<LI>" . ' ' x (length($1)-4) . "$2",
                    *newpara, 0, "UL");
            $lihack = $lindent;
        } elsif (length($line) < ($width - 12)) {
            # Short line: flowing text if it ends like a sentence,
            # otherwise preformatted.
            if ($line =~ /[\.\?\!\:\-][\)\]\"\']* *$/) {
                &prline($lindent, $line, *newpara);
            } else {
                &prline($lindent, $line, *newpara, 0, "PRE");
            }
        } else {
            # Full-width line: hold it back if it ends in a single hyphen
            # so that it can be joined to the next line.
            $hyphen = ($line =~ /$hstring/o);
            if ($hyphen) {
                print(STDERR "*** Unhyphenating line: $text\n")
                    unless defined($quiet);
                $text = $1;
            } else {
                &prline($lindent, $line, *newpara);
            }
        }
    }
}

# prline(INDENT, TEXT, *NEWPARA, NEWNEWPARA, TAG)
#
# Print one line of output, opening and closing list/PRE tags as required.
#   INDENT      indentation level of the line
#   TEXT        the (already HTML-escaped) text to print
#   *NEWPARA    typeglob of the caller's "start a new paragraph" flag; it is
#               aliased to $newpara for the duration of the call
#   NEWNEWPARA  value stored into $newpara before returning (default 0)
#   TAG         enclosing element: "TEXT" (default), "PRE", "UL", "OL",
#               "DL" or "DD"
#
# Nesting is tracked by two parallel stacks: @indstack holds the indentation
# of each open level and @tagstack the tag that was opened for it.  $waspre
# is non-zero while a <PRE> block is open.
sub prline {
    local($indent, $text, *newpara, $newnewpara, $tag) = @_;

    # Emit the document preamble before the very first output.
    print("<HTML>\n<HEAD>\n<TITLE>$title</TITLE>\n</HEAD>\n<BODY>\n")
	unless $inbody++;

    # $dlhack is set by the caller after a definition term with nothing
    # following it on the same line: the indentation recorded on top of the
    # stack is replaced by that of this line (unless this is another term).
    if ($dlhack) {
        $indstack[$#indstack] = $indent unless $tag eq "DL";
	$dlhack = 0;
    }

    # A line indented like the last list item's marker, with no paragraph
    # break in between, is printed at the indentation on top of the stack.
    # A new paragraph at that top-of-stack indentation leaves $lihack alone;
    # anything else clears it.
    if (!$newpara && defined($lihack) && $indent == $lihack) {
	$indent = $indstack[$#indstack];
    } elsif ($newpara && $indent == $indstack[$#indstack]) {
    } else {
	undef($lihack);
    }

    $tag || ($tag = "TEXT");
    $newnewpara || ($newnewpara = 0);
    # Close every level that is indented deeper than this line.  "TEXT" and
    # "DD" levels have no closing tag.
    while ($#indstack >= 0 && $indstack[$#indstack] > $indent) {
	if ($waspre) {
	    print("</PRE>\n");
	    $waspre = 0;
	    $newpara = 0;
	}
	pop(@indstack);
	$oldtag = pop(@tagstack);
	print("</$oldtag>\n") unless ($oldtag eq "TEXT" || $oldtag eq "DD");
    }

    # Plain text with nothing open establishes the base level.
    if ($tag eq "TEXT" && $#tagstack == -1) {
	push(@indstack, $indent);
	push(@tagstack, $tag);
	$newpara = 0;
    }

    # Explicit PRE lines, and plain text indented deeper than the current
    # level, are printed preformatted.  The first N characters of TEXT are
    # dropped, where N is the indentation on top of the stack.
    if ($tag eq "PRE" ||
	(($tag eq "TEXT") && ($indent > $indstack[$#indstack])))
    {
	$dedent = substr($text, $indstack[$#indstack]);
	if ($waspre++) {
	    # Already inside <PRE>: a paragraph break is just a blank line.
	    print("\n") if $newpara;
	    print("$dedent\n");
	} else {
	    print("<PRE>\n$dedent\n");
	}
	$newpara = $newnewpara;
	return;
    } elsif ($waspre) {
	# Leaving preformatted text.
	print("</PRE>\n");
	$waspre = 0;
	$newpara = 0;
    }
	    
    if ($#indstack > -1 && $indstack[$#indstack] == $indent) {
	# Same level as before: swap the open tag if a different kind of
	# list starts here, otherwise just continue.
	if ($tag ne $tagstack[$#tagstack] && $tag ne "TEXT" && $tag ne "DD"){
	    $oldtag = pop(@tagstack);
	    print("</$oldtag>\n");
	    print("<P>\n") if $newpara;
	    push(@tagstack, $tag);
	    print("<$tag>\n");
	} else {
	    print("<P>\n") if $newpara;
	}
    } else {
	# A new level: record it and open its tag ("DD" has no opening tag
	# of its own here; the caller puts <DD> into TEXT).
	push(@indstack, $indent);
	push(@tagstack, $tag);
	print("<P>\n") if $newpara;
	print("<$tag>\n") unless $tag eq "DD";
    }
    print("$text\n");
    $newpara = $newnewpara;
}

# prbreak(TEXT, *NEWPARA)
#
# Print TEXT as a top-level break (a header, <HR>, or the closing tags of
# the document).  Every level still open on @indstack/@tagstack is closed
# first, so TEXT always appears outside any list or <PRE> block.  The
# caller's paragraph flag, passed as a typeglob, is reset to 0.
sub prbreak {
    local($text, *newpara) = @_;

    # The preamble goes out exactly once, before the first piece of output.
    if (!$inbody++) {
	print("<HTML>\n<HEAD>\n<TITLE>$title</TITLE>\n</HEAD>\n<BODY>\n");
    }

    $dlhack = 0;

    # Unwind the nesting stacks completely.
    while (@indstack) {
	if ($waspre) {
	    $waspre = 0;
	    print("</PRE>\n");
	}
	pop(@indstack);
	$oldtag = pop(@tagstack);
	# "TEXT" and "DD" levels were opened without a tag of their own.
	next if $oldtag eq "TEXT" || $oldtag eq "DD";
	print("</$oldtag>\n");
    }

    print("$text\n");
    $newpara = 0;
}
#</plaintext>
