#! /usr/bin/perl

# echos lines but for %W url, fetches page and appends %X abstract and %K keywords (maybe %Y toc)

# http://www.xav.com/perl/site/lib/lwpcook.html

use LWP::Simple;

&initmap();

while (<>) {
	if (/%W (\S+)$/) {
		$url = $1;
		$url = "http://dx.doi.org/$url" if ($url =~ m|^\d+[.]\d+|);
		# print "url=$url\n";
		$doc = &myget($url);
		# print "doc=" . $doc;
		$abstract = &getsection($url, "abstract", $doc);
		$keywords = &getsection($url, "keywords", $doc);
		# $pages    = &getsection($url, "pages",    $doc); # unusual to request
		# $toc      = &getsection($url, "toc",      $doc); # unusual to request
		# $authors    = &getsection($url, "authors",    $doc); # unusual to request
		print "%W $url\n";
		# print "%O size of doc = " . length($doc) . "\n";
		# print "$doc";
		# print "%A $authors\n" if $authors;
		# print "%P $pages\n" if $pages;
		print "%X $abstract\n" if $abstract;
		print "%K $keywords\n" if $keywords;
		# print "%Y $toc\n" if $toc;
		$toc = $keywords = $abstract = $pages = $authors = "";
	} else {
		print;
	}
}

sub clean { # cleans whitespace
	local ($s) = (@_);
	$s =~ s/^\s*//;
	$s =~ s/\s*$//;
	return $s;
}

sub myget {
	local ($url) = (@_);
	local ($doc);
	return "" if ($url =~ /.pdf$/i);
	return "" if ($url =~ /introduction.html$/i);
	if (! &springerUrl($url) ) {
		$doc = get $url;
		# print ("doc length=" . length($doc) . "\n");
		if (upassocUrl($url)) {
			$doc =~ s|.*<h1>|<h1>|s; # real content starts here for UPA JUS
			$doc =~ s|||gs;
		}
		return $doc if $doc;
	}
	# print "lynx -source $url\n";
	open PIPE, "lynx -source $url |" || die "can't open pipe";
	while (<PIPE>) {
		$doc .= $_;
	}
	close PIPE;
	return $doc;
}

sub initmap {
@map = ( # first char is decimal 192
	'Agrave', 'Aacute', 'Acirc', 'Atilde', 'Auml', 'Aring', 'AElig', 'Ccedil',
	'Egrave', 'Eacute', 'Ecirc', 'Euml', 'Igrave', 'Iacute', 'Icirc', 'Iuml',
	'Eth', 'Ntilde', 'Ograve', 'Oacute', 'Ocirc', 'Otilde', 'Ouml', 'times',
	'Oslash', 'Ugrave', 'Uacute', 'Ucirc', 'Uuml', 'Yacute', 'Thorn', 'szlig',
	'agrave', 'aacute', 'acirc', 'atilde', 'auml', 'aring', 'aelig', 'ccedil',
	'egrave', 'eacute', 'ecirc', 'euml', 'igrave', 'iacute', 'icirc', 'iuml',
	'eth', 'ntilde', 'ograve', 'oacute', 'ocirc', 'otilde', 'ouml', 'divides',
	'oslash', 'ugrave', 'uacute', 'ucirc', 'uuml', 'yacute', 'thorn', 'yuml',
	);
}
sub getmap {
	local ($code) = (@_);
	return "&" . $map[$code-192] . ';';
}
sub mnemonic { # maps &#233; to &eacute;
	local ($text) = (@_);
	$text =~ s/&#(\d\d\d);/&getmap($1)/ge;
	return $text;
}

sub springerUrl {
	local ($url) = (@_);
	return $url =~ /10.1007/;
}
sub elsevierUrl {
	local ($url) = (@_);
	return $url =~ /(intcom|ijhcs)/;
}
sub upassocUrl {
	local ($url) = (@_);
	return $url =~ /upassoc.org/;
}

sub getsection { # url section doc=html-text
	local ($url, $section, $doc) = (@_);
	if (&elsevierUrl($url)) {
		# print "IWC: $section len=" . length($doc) . "\n";
		$doc = &getIWCsection($section, $doc);
	} elsif (&springerUrl($url)) {
		# print "Springer: $section len=" . length($doc) . "\n";
		$doc = &getSpringerSection($section, $doc);
	} elsif (&upassocUrl($url)) {
		# print "UPA: $section len=" . length($doc) . "\n";
		$doc = &getUPAsection($section, $doc);
	} else { 
		$doc = getACMsection($section, $doc) unless $section eq 'toc'
	}
	$doc =~ s|^\s+||s;        # trim leading space
	$doc =~ s|\s+$||s;        # trim trailing space
	return $doc;
}

sub getSpringerSection {
	local ($section, $doc) = (@_);
	if ($section eq "abstract") {
		if ($doc =~ m|^.*<span class="AbstractHeading">Abstract&nbsp;&nbsp;</span>|s) {
			$doc =~ s///s;
			$doc =~ s|</div>.*$||s;
			$doc =~ s/&nbsp;/ /g;
			$doc =~ s/&#822[01];/"/g;     # double quotes
			$doc =~ s/&#821[67];/'/g;     # single quotes
			$doc =~ s/&#8211;/-/g;        # hyphen
			$doc =~ s/&#8212;/--/g;       # long dash
			$doc =~ s|</?[ibu]>||g;       # remove all italics, bold, underline
		} else {
			$doc = "";
		}
	} elsif ($section eq "keywords") {
		if ($doc =~ m|^.*<p class="Keyword"><span class="KeywordHeading">Keywords&nbsp;&nbsp;</span>|s) {
			# print "matched keywords\n";
			$doc =~ s///s;
			$doc =~ s|</p>.*$||s;
			$doc =~ s/&nbsp;-&nbsp;/, /g;
		} else {
			$doc = "";
		}
	} elsif ($section eq "pages") {
		if ($doc =~ m|^.*<td class="labelName">Pages</td><td class="labelValue">|s) {
			# print "matched pages\n";
			$doc =~ s///s;
			$doc =~ s|</td>.*$||s;
		} else {
			$doc = "";
		}
	} else {
		$doc = "";
	}
	return $doc;
}

sub getUPAsection {
	local ($section, $doc) = (@_);
	if ($section eq "abstract") {
		$doc =~ s|^.*<h[23]>\s*Abstract\s*</h[23]>||s;
		$doc =~ s|<h\d.*$||s;
		$doc =~ s|<[^>]*>||gs; # remove all html
	} elsif ($section eq "keywords") {
		$doc = "";
	} elsif ($section eq "toc") {
		$doc =~ s|.*<h2>Article Contents</h2>||s;
		$doc =~ s|<h2.*$||s;
		$doc =~ s|<[^>]*>||gs; # remove all html
		$doc =~ s|^\s+||s;
		$doc =~ s|\s+$||s;
		# $doc =~ s|<ul>||g;
		# $doc =~ s|</ul>||g;
	} elsif ($section eq "pages") {
		$doc =~ s|<h2.*$||;
		if ($doc =~ m|pp[.]\s*(\d+)\s*-\s*(\d+)|s) {
			$doc = "$1-$2";
		}
	} elsif ($section eq "authors") {
		$doc =~ s|<h[23].*$||s;
		if ($doc =~ m|authors.html">([^<]*)<|s) {
			$doc = $1;
		} elsif ($doc =~ m|[^<]*<p[^>]*>([^<]*)<|) {
			$doc = $1;
		}
	}
	return $doc;
}

sub getIWCsection {
	local ($section, $doc) = (@_);
	$doc = &fixIWC($doc);
	if ($section eq "abstract") {
		if ($doc =~ m|^.*<h3 class="h3">Abstract</h3>|s) {
			$doc =~ s///s;
			$doc =~ s|</p>\s*</div>\s*<!-- articleText -->.*$||s;
			$doc =~ s|</p><p>|\n   |g;  # insert hcibib para
			$doc =~ s|</?p>||g;
		} else {
			$doc = "";
		}
	} elsif ($section eq "keywords") {
		if ($doc =~ m|^.*<p><strong>Keywords: </strong>|s) {
			$doc =~ s///s;
			$doc =~ s|</p> </div><!-- articleText -->.*$||s;
		} else {
			$doc = "";
		}
	} elsif ($section eq "toc") {
		if ($doc =~ m|^.*<h3 class="h3">Article Outline</h3>|s) {
			$doc =~ s///s;
			$doc =~ s|\s*</div><!-- articleText -->.*$||s;
			$doc =~ s|<a/?[^>]*>||g; # remove links
			$doc =~ s|</a>||g;       # remove links
			$doc =~ s|<dt> *|\n|g;   # newline on <dt>
			$doc =~ s|</dt>||g;      # clean
			$doc =~ s|</?dl>||g;
			$doc =~ s|</?i>||;       # drop italics
			$doc =~ s|^ *About the author.*$||i;
			$doc =~ s|^ *References *$||i;
			$doc =~ s|^ *Further Readings *$||i;
			$doc =~ s|^ *Acknowledgements *$||i;
			$doc =~ s|^ *Editor's Note *$||i;
		} else {
			$doc = "";
		}
	}
	return $doc;
}

sub fixIWC {
	local ($doc) = (@_);
	$doc =~ s/&#x2013;/-/g;
	$doc =~ s/&#x2019;/'/g;
	$doc =~ s/&#x2018;/'/g; # left quote
	$doc =~ s/&#x201c;/"/g; # left dquote
	$doc =~ s/&#x201d;/"/g; # right dquote
	$doc =~ s/&#x2014;/--/g; # long dash
	return $doc;
}

sub getACMsection { # section doc=html-text
	local ($section, $doc) = (@_);
	# <div class="abstract"> used for many types of info
	# <p class="abstract"> used for abstract and references
	# <p class="keywords"> used only for keywords
	# class="GenTerms" and class="Categories" for ACM terms
	$doc =~ s/<p class="$section">//g;
	$doc =~ s/<p>//g;
	$doc =~ s/<\/p>//g;
	$doc =~ s/([^]*)/<P>$1<\/P>/g; # save all plain paragraphs
	$doc =~ s|<font color="Red">Note: OCR errors may be found in this Reference List extracted from the full text article.  ACM has opted to\s*expose the complete List rather than only correct and linked references.</font>||;
	$doc =~ s/<a [^>]*>//g;
	$doc =~ s/<\/a>//g;
	$doc =~ s/,\s+/, /g;
	$doc =~ s/<BR>//g;
	# filter out html
	$doc =~ s|<i>||g; $doc =~ s|</i>||g;
	$doc =~ s|<b>||g; $doc =~ s|</b>||g;
	$doc =~ s|<sup>(\w+)</sup>|{sup:$1}|g; # superscripts
	$doc =~ s|<SPAN class=heading><A NAME="Keywords">Keywords:</A></span>\s*||;
	$doc =~ s/<\/P>\s*<p>/\012   /gi;
	$doc =~ s/<p>//gi;
	$doc =~ s/<\/p>//gi;
	$doc =~ s/&percnt;/%/g; # entity does not display as % in browser

	local ($match);
	while ($doc =~ /([^]*)/) {
		$doc =~ s///;
		$match .= &clean($1) . "\n";
	}
	return &mnemonic(&clean($match));
}
