#! /usr/bin/perl
# Echoes its input, but for every "%W url" line fetches the page and appends
# an "%X abstract" and a "%K keywords" line (maybe a "%Y toc").
# http://www.xav.com/perl/site/lib/lwpcook.html
#
# NOTE(review): the copy of this script that was reviewed had been through an
# HTML-to-text conversion.  Line structure was gone, every literal HTML tag
# inside a pattern had been stripped, numeric entities had been decoded and
# several "*" quantifiers were missing.  Patterns marked "reconstructed" below
# were rebuilt from the surviving text and comments; where a tag name could
# not be recovered a generic tag pattern stands in for it.  Check each of them
# against a live page (or the upstream script) before relying on the output.

use strict;
use warnings;

# libwww-perl does the normal fetching.  It is loaded (and its default exports
# imported) when present; without it every fetch falls through to the lynx
# path in myget() instead of the script failing to start.
my $have_lwp;
BEGIN {
    $have_lwp = eval { require LWP::Simple; LWP::Simple->import(); 1 } ? 1 : 0;
}

# Reconstructed stand-ins for the tag literals that were lost (see NOTE above).
my $TAG   = qr{<[^>]*>};    # any single tag
my $BLOCK = qr{</?(?:p|div|h[1-6]|hr|br|table|tr|td|th|ul|ol|li|dl|dt|dd|blockquote)\b[^>]*>}i;

our @map;                   # entity names for codes 192..255, set by initmap()

initmap();

# Filter loop: copy input to output, expanding each "%W url" record.
# The original also carried disabled code for %A authors, %P pages and %Y toc
# ("unusual to request"); those outputs remain disabled.
sub main {
    while (my $line = <>) {
        if ($line =~ /%W (\S+)$/) {
            my $url = $1;

            # A bare DOI such as 10.1007/... is resolved through dx.doi.org.
            $url = "http://dx.doi.org/$url" if $url =~ m|^\d+[.]\d+|;

            my $doc      = myget($url);
            my $abstract = getsection($url, "abstract", $doc);
            my $keywords = getsection($url, "keywords", $doc);

            print "%W $url\n";
            print "%X $abstract\n" if $abstract;
            print "%K $keywords\n" if $keywords;
        }
        else {
            print $line;
        }
    }
    return;
}

# clean($s): returns $s without leading and trailing white space.
# An undefined argument yields the empty string.
sub clean {
    my ($s) = @_;
    return '' unless defined $s;
    $s =~ s/^\s+//;
    $s =~ s/\s+$//;
    return $s;
}

# myget($url): returns the HTML source of $url, or "" when it is not fetched.
# PDF files and "introduction.html" pages are never fetched.  Springer URLs
# skip LWP and go straight to lynx; for every other URL lynx is only the
# fallback when LWP returns nothing.
sub myget {
    my ($url) = @_;
    my $doc;

    # The dots are escaped so that e.g. "...xpdf" is no longer skipped.
    return "" if $url =~ /\.pdf$/i;
    return "" if $url =~ /introduction\.html$/i;

    if (!springerUrl($url)) {
        $doc = $have_lwp ? LWP::Simple::get($url) : undef;
        if (defined $doc && upassocUrl($url)) {
            # TODO(review): the original first discarded everything before a
            # marker tag ("real content starts here for UPA JUS"); the tag
            # name was lost, so that step is not reproduced here.
            $doc =~ s|&nbsp;||gs;    # reconstructed: entity had been decoded
        }
        return $doc if $doc;
    }

    # List-form pipe open: the URL comes from the input file and must not be
    # passed through a shell.  A failure is reported but not fatal, so the
    # remaining input is still echoed (the original "|| die" bound to the
    # command string and therefore never fired either).
    my $pipe;
    if (!open($pipe, '-|', 'lynx', '-source', $url)) {
        warn "can't open pipe: $!\n";
        return "";
    }
    $doc = '';
    while (my $chunk = <$pipe>) {
        $doc .= $chunk;
    }
    close $pipe;
    return $doc;
}

# initmap(): fills @map with the HTML entity names for character codes
# 192..255 (first element is decimal 192).  "ETH", "THORN" and "divide" replace
# the original "Eth", "Thorn" and "divides", which are not HTML entity names.
sub initmap {
    @map = qw(
        Agrave Aacute Acirc  Atilde Auml   Aring  AElig  Ccedil
        Egrave Eacute Ecirc  Euml   Igrave Iacute Icirc  Iuml
        ETH    Ntilde Ograve Oacute Ocirc  Otilde Ouml   times
        Oslash Ugrave Uacute Ucirc  Uuml   Yacute THORN  szlig
        agrave aacute acirc  atilde auml   aring  aelig  ccedil
        egrave eacute ecirc  euml   igrave iacute icirc  iuml
        eth    ntilde ograve oacute ocirc  otilde ouml   divide
        oslash ugrave uacute ucirc  uuml   yacute thorn  yuml
    );
    return;
}

# getmap($code): returns the named entity ("&eacute;") for a character code.
# Codes outside the table are returned as the numeric reference they came
# from; the original indexed @map with a negative or out-of-range subscript
# and so produced a wrong entity or "&;".
sub getmap {
    my ($code) = @_;
    return "&#$code;" if $code < 192 || $code > 191 + scalar(@map);
    return "&" . $map[$code - 192] . ';';
}

# mnemonic($text): maps three-digit numeric references to named entities,
# e.g. &#233; to &eacute;.  References without a table entry are left alone.
sub mnemonic {
    my ($text) = @_;
    return '' unless defined $text;
    $text =~ s/&#(\d\d\d);/getmap($1)/ge;
    return $text;
}

# springerUrl($url): true when the URL contains the DOI prefix 10.1007.
sub springerUrl {
    my ($url) = @_;
    return $url =~ /10\.1007/ ? 1 : 0;
}

# elsevierUrl($url): true when the URL mentions "intcom" or "ijhcs".
sub elsevierUrl {
    my ($url) = @_;
    return $url =~ /intcom|ijhcs/ ? 1 : 0;
}

# upassocUrl($url): true when the URL mentions upassoc.org.
sub upassocUrl {
    my ($url) = @_;
    return $url =~ /upassoc\.org/ ? 1 : 0;
}

# getsection($url, $section, $doc): extracts $section ("abstract", "keywords",
# "toc", "pages", ...) from the HTML text $doc using the extractor chosen by
# $url, and returns it with surrounding white space removed.  For a URL that
# matches none of the publisher checks the ACM extractor is used, except for
# "toc", where $doc is passed through unchanged.
sub getsection {
    my ($url, $section, $doc) = @_;
    $doc = '' unless defined $doc;

    if (elsevierUrl($url)) {
        $doc = getIWCsection($section, $doc);
    }
    elsif (springerUrl($url)) {
        $doc = getSpringerSection($section, $doc);
    }
    elsif (upassocUrl($url)) {
        $doc = getUPAsection($section, $doc);
    }
    elsif ($section ne 'toc') {
        $doc = getACMsection($section, $doc);
    }

    $doc =~ s|^\s+||s;    # trim leading space
    $doc =~ s|\s+$||s;    # trim trailing space
    return $doc;
}

# getSpringerSection($section, $doc): returns the "abstract", "keywords" or
# "pages" text of a Springer page, or "" when the heading is not found or the
# section is not supported.
# NOTE(review): heading and end-of-section tags are reconstructed.  A heading
# is taken to be the word directly after a tag; abstract and keywords end at
# the next block-level tag, pages at the next tag of any kind.
sub getSpringerSection {
    my ($section, $doc) = @_;
    $doc = '' unless defined $doc;

    if ($section eq "abstract") {
        # Greedy .* keeps only what follows the last "Abstract" heading.
        return "" unless $doc =~ s{^.*$TAG\s*Abstract\s*(?:$TAG\s*)*}{}s;
        $doc =~ s{$BLOCK.*}{}s;
        $doc =~ s/&nbsp;/ /g;
        $doc =~ s/&#822[01];/"/g;     # double quotes
        $doc =~ s/&#821[67];/'/g;     # single quotes
        $doc =~ s/&#8211;/-/g;        # hyphen
        $doc =~ s/&#8212;/--/g;       # long dash
        $doc =~ s{</?[ibu]>}{}gi;     # remove all italics, bold, underline
        return $doc;
    }
    if ($section eq "keywords") {
        return "" unless $doc =~ s{^.*$TAG\s*Keywords\s*(?:$TAG\s*)*}{}s;
        $doc =~ s{$BLOCK.*}{}s;
        $doc =~ s/ - /, /g;           # keywords are listed "a - b - c"
        return $doc;
    }
    if ($section eq "pages") {
        return "" unless $doc =~ s{^.*$TAG\s*Pages\s*(?:$TAG\s*)*}{}s;
        $doc =~ s{<.*}{}s;
        return $doc;
    }
    return "";
}

# getUPAsection($section, $doc): section extractor for upassoc.org pages.
# "keywords" is always ""; any section not handled below returns $doc
# unchanged, as in the original.
sub getUPAsection {
    my ($section, $doc) = @_;
    $doc = '' unless defined $doc;

    if ($section eq "abstract") {
        $doc =~ s|^.*\s*Abstract\s*||s;
        $doc =~ s|<[^>]*>||gs;        # remove all html
    }
    elsif ($section eq "keywords") {
        $doc = "";
    }
    elsif ($section eq "toc") {
        # NOTE(review): the tags around the heading are reconstructed.
        $doc =~ s{.*$TAG\s*Article Contents\s*$TAG}{}s;
        $doc =~ s|<[^>]*>||gs;        # remove all html
        $doc =~ s|^\s+||s;
        $doc =~ s|\s+$||s;
    }
    elsif ($section eq "pages") {
        # TODO(review): the two patterns that captured the page range were
        # damaged beyond recovery; nothing is extracted until they are
        # restored from the upstream script.
        $doc = "";
    }
    return $doc;
}

# getIWCsection($section, $doc): section extractor for the Elsevier journals
# matched by elsevierUrl().  "abstract", "keywords" and "toc" return "" when
# their heading is missing; other sections return the fixIWC()-normalised
# document.
# NOTE(review): all tag literals in this sub are reconstructed.
sub getIWCsection {
    my ($section, $doc) = @_;
    $doc = fixIWC($doc);

    if ($section eq "abstract") {
        return "" unless $doc =~ s{^.*$TAG\s*Abstract\s*$TAG}{}s;
        # Ends where a block-level tag is followed by a non-block tag.
        $doc =~ s{$BLOCK\s*(?!$BLOCK)$TAG.*}{}s;
        $doc =~ s{<p\b[^>]*>}{\n }gi;    # insert hcibib para
        $doc =~ s{</p>}{}gi;
    }
    elsif ($section eq "keywords") {
        return "" unless $doc =~ s{^.*$TAG\s*Keywords:\s*(?:$TAG\s*)*}{}s;
        $doc =~ s{$BLOCK.*}{}s;
    }
    elsif ($section eq "toc") {
        return "" unless $doc =~ s{^.*$TAG\s*Article Outline\s*$TAG}{}s;
        $doc =~ s{</(?:dl|ul|ol)\s*>.*}{}si;     # outline ends with its list
        $doc =~ s|<a[^>]*>||g;                   # remove links
        $doc =~ s|</a>||g;                       # remove links
        $doc =~ s{<(?:li|dt|dd)\b[^>]*>}{\n}gi;  # newline on each entry
        $doc =~ s{</?(?:li|dt|dd|dl|ul|ol)\b[^>]*>}{}gi;    # clean
        $doc =~ s{</?i>}{}gi;                    # drop italics
        # /m added: without it these anchors only matched a document that
        # consisted of nothing but the one entry.
        $doc =~ s|^ About the author.*$||im;
        $doc =~ s|^ References $||im;
        $doc =~ s|^ Further Readings $||im;
        $doc =~ s|^ Acknowledgements $||im;
        $doc =~ s|^ Editor's Note $||im;
    }
    return $doc;
}

# fixIWC($doc): returns $doc with typographic dashes and quotes replaced by
# ASCII.  The reviewed copy showed the already-decoded characters, so the
# numeric entity, the named entity and the character itself are all accepted.
sub fixIWC {
    my ($doc) = @_;
    $doc = '' unless defined $doc;
    $doc =~ s/&(?:#8211|ndash);|\x{2013}/-/g;
    $doc =~ s/&(?:#8217|rsquo);|\x{2019}/'/g;
    $doc =~ s/&(?:#8216|lsquo);|\x{2018}/'/g;     # left quote
    $doc =~ s/&(?:#8220|ldquo);|\x{201C}/"/g;     # left dquote
    $doc =~ s/&(?:#8221|rdquo);|\x{201D}/"/g;     # right dquote
    $doc =~ s/&(?:#8212|mdash);|\x{2014}/--/g;    # long dash
    return $doc;
}

# getACMsection($section, $doc): default extractor.  Collects the text of
# every paragraph tagged for $section, joins the pieces with newlines and
# returns them with numeric references converted by mnemonic(); "" when
# nothing matches.
# NOTE(review): the original header comments named the tag used for each kind
# of information (plus class="GenTerms" and class="Categories" for ACM terms)
# but the tag names were lost.  <p class="SECTION"> is assumed here, and the
# set of inline tags that is filtered out is likewise reconstructed.
sub getACMsection {
    my ($section, $doc) = @_;
    $doc = '' unless defined $doc;

    $doc =~ s{<br\b[^>]*>}{}gi;
    $doc =~ s/<\/p>//g;
    $doc =~ s/<p>([^<>]*)/<P>$1<\/P>/g;    # save all plain paragraphs
    $doc =~ s|Note: OCR errors may be found in this Reference List extracted from the full text article. ACM has opted to\s+expose the complete List rather than only correct and linked references.||;
    $doc =~ s/<a[^>]*>//g;
    $doc =~ s/<\/a>//g;
    $doc =~ s/,\s+/, /g;
    $doc =~ s{</?(?:i|b|u|em|strong|tt)>}{}gi;    # filter out html
    $doc =~ s|<sup>(\w+)</sup>|{sup:$1}|g;        # superscripts
    $doc =~ s|Keywords:\s*||;
    $doc =~ s/<\/P>\s*<P>/\012 /gi;        # adjacent paragraphs: hcibib para
    $doc =~ s/<p>//gi;
    $doc =~ s/<\/p>//gi;
    $doc =~ s/&#37;/%/g;    # entity does not display as % in browser

    # Each successful substitution removes one tagged paragraph and leaves its
    # text in $1, so the loop ends once no tagged paragraph is left.
    my $match = '';
    while ($doc =~ s{<p\s+class="\Q$section\E"\s*>([^<>]*)}{}i) {
        $match .= clean($1) . "\n";
    }
    return mnemonic(clean($match));
}

# Run the filter only when executed as a script, so the subs above can be
# loaded (e.g. by a test) without consuming input.
main() unless caller;