#!/bin/sh
#
# ungeoify - remove Geocrawler (and hypermail) stuff that clogs up the index
#
# Usage: ungeoify temp-file text/html URL
#
# Actually called as external converter script from htdig...
# external_parsers:	text/html->text/html-internal /path/to/ungeoify
#

# ungeoify TEMP-FILE CONTENT-TYPE URL
#
# Copy the HTML in TEMP-FILE to stdout, rewritten according to the archive
# that URL belongs to.  CONTENT-TYPE ($2) is accepted but not examined.
# A missing URL is treated like any unrecognized URL (file copied as-is).
# Returns 2 if no TEMP-FILE is given, otherwise the status of the last
# command run for the selected branch.
ungeoify() {
  # Without a file name a bare cat/sed would silently block reading stdin.
  if [ "$#" -lt 1 ] || [ -z "$1" ]; then
    echo 'Usage: ungeoify temp-file text/html URL' >&2
    return 2
  fi

  # select set of modifications to HTML based on URL...
  # ("$1" is always quoted so paths with blanks or glob characters survive)
  case "${3-}" in
  http://www.htdig.org/htdig-dev/????/??/att-*|http://www.htdig.org/mail/????/??/att-*)
    # index hypermail messages' HTML attachments as-is
    cat -- "$1"
    ;;

  http://www.htdig.org/htdig-dev/*|http://www.htdig.org/mail/*)
    # old htdig.org hypermail archives... use received date of message as
    # mod time, don't index message listings themselves (no received date in
    # these), follow links to other messages but don't index the text
    # in&around links, use full subject line for title, instead of
    # truncated one.

    # Turn the hypermail "received" comment into a date meta tag; the second
    # sed maps the month abbreviation to its two-digit number.
    sed -n -e 's|^<!-- received="... \(...\)  *\([0-9]*\) \(..:..:..\) \(....\).*|<meta name="date" content="\4 \1 \2 \3">|p' -- "$1" |
      sed -e 's/Jan/01/; s/Feb/02/; s/Mar/03/; s/Apr/04/; s/May/05/; s/Jun/06/; s/Jul/07/; s/Aug/08/; s/Sep/09/; s/Oct/10/; s/Nov/11/; s/Dec/12/'

    # No "received" comment means this is a listing page: follow, don't index.
    grep '^<!-- received="' -- "$1" > /dev/null ||
      echo '<meta name="robots" content="noindex,follow">'

    # Fence everything outside the message body with noindex markers and
    # replace the truncated <TITLE> with the full Subject.
    sed -e 's|Messages sorted by:|<noindex follow>|' \
      -e 's|<!-- body="start" -->|</noindex>|' \
      -e 's|<!-- body="end" -->|<noindex follow>|' \
      -e '/^<TITLE>/d' \
      -e 's|^<META NAME="Subject" CONTENT="\(.*\)">|<title>\1</title>|' -- "$1"
    ;;

  http://www.geocrawler.com/archives/*)
    # newer geocrawler archives... use date of message as mod time,
    # don't index message listings themselves (no date field in these),
    # normalize base tags to force start number to 0, which prevents
    # duplicate URLs as listings grow and older messages are pushed to
    # later pages, strip out extra bits we just don't want to index
    sed -n -e 's|^.*>DATE: \(..\)/\(..\)/\(....\)&nbsp;\(..:..:..\)<.*|<meta name="date" content="\3 \1 \2 \4">|p' -- "$1"

    grep '>DATE: ../../' -- "$1" > /dev/null ||
      echo '<meta name="robots" content="noindex,follow">'

    # Only the header (up to <BODY), any <BASE line, and the <H3> ... "OSDN
    # Footer" section are emitted.  Note: a <BASE line that falls inside the
    # header range is printed by both the range and the explicit /^<BASE/p.
    sed -n -e 's|^\(<BASE.*="http://www.geocrawler.com/archives/3/882./20../[1-9][0-9]*\)/[0-9]*/\(.*".*\)|\1/0/\2|' \
      -e '1,/^<BODY/p; /^<BASE/p; /^<H3>/,/OSDN Footer/p' -- "$1"
    ;;

  *)
    # index any other HTML file as-is
    cat -- "$1"
    ;;
  esac
}

ungeoify "$@"

