#!/bin/ksh93

#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#

#
# Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
# Use is subject to license terms.
#
# ident	"%Z%%M%	%I%	%E% SMI"
#

#
# rssread - a simple RSS2.0 reader with RSS to XHTML to
# plaintext conversion.
#

# Solaris keeps its POSIX-conformant utilities in /usr/xpg4/bin (the ones in
# /usr/bin are not conformant), so that directory has to be searched first.
PATH=/usr/xpg4/bin:/bin:/usr/bin
export PATH

# printmsg - write all arguments as one message line to stderr
function printmsg
{
    print "$@" >&2
}

# debugmsg - debug message hook; currently a no-op which always succeeds
function debugmsg
{
    # the forwarding call below is disabled, so nothing is printed
#    printmsg "$@"
    return 0
}

# fatal_error - print "<progname>: <message>" on stderr and terminate the
# current (sub)shell with exit status 1
function fatal_error
{
    print "${progname}: $@" >&2
    exit 1
}

# cat_http - fetch a URL via HTTP/1.0 and write the raw server response
# (status line and headers included) to stdout.
#
# Arguments: $1 - URL of the form "protocol://host[:port]/path"
# Output:    the unmodified response on stdout; a one-line summary of the
#            parsed URL on stderr (via printmsg)
# Exit:      the body runs in a subshell, so the variables set here do not
#            leak; fatal_error terminates that subshell with status 1 when
#            a component of the URL is missing
function cat_http
{
(
    protocol="${1%://*}"
    path1="${1#*://}" # "http://foo.bat.net/x/y.html" ----> "foo.bat.net/x/y.html"

    host="${path1%%/*}"
    port="${host##*:}"

    # Only strip the host part when there is a "/" to strip - otherwise a
    # URL like "http://foo.bat.net" would yield the host name as the path
    # and we would request "/foo.bat.net".
    case "${path1}" in
        */*) path="${path1#*/}" ;;
        *)   path="" ;;
    esac

    # If URL did not contain a port number in the host part then look at the
    # protocol to get the port number
    if [ "${port}" = "${host}" ] ; then
        case "${protocol}" in
            "http") port=80 ;;
            *)      port="$(getent services "${protocol}" | sed 's/[^0-9]*//;s/\/.*//')" ;;
        esac
    else
        host="${host%:*}"
    fi

    printmsg "protocol=${protocol} port=${port} host=${host} path=${path}"

    # prechecks
    [ "${protocol}" = "" ] && fatal_error "protocol not set."
    [ "${port}"     = "" ] && fatal_error "port not set."
    [ "${host}"     = "" ] && fatal_error "host not set."
    [ "${path}"     = "" ] && fatal_error "path not set."

    # open TCP channel (read+write) on file descriptor 3
    exec 3<>"/dev/tcp/${host}/${port}"

    # send HTTP request - HTTP wants every header line terminated by CRLF
    # and an empty line after the last header. The URL parts are passed as
    # "%s" arguments so that backslashes in them are sent verbatim instead
    # of being interpreted as escape sequences.
    printf "GET /%s HTTP/1.0\r\nHost: %s\r\nUser-Agent: %s\r\n\r\n" \
        "${path}" \
        "${host}" \
        "ksh93/rssread (2007-01-16; $(uname -s -r -p))" >&3

    # collect response and send it to stdout
    cat <&3
)
}

# html_entity_to_ascii - filter which copies stdin to stdout and replaces
# character entity references with the characters they stand for:
#   "&name;"     - looked up in the table below
#   "&#NNNN;"    - decimal character code
#   "&#xHHHH;"   - hexadecimal character code
# Unknown references are emitted as "<ENT=name>". An "&" which does not
# start a well-formed reference (e.g. "AT&T", "a & b" or an unterminated
# reference at the end of the input) is passed through unchanged.
function html_entity_to_ascii
{
    typeset -A entity_cache
    typeset c=""            # character just read from stdin
    typeset entity=""       # reference name collected so far (no "&", no ";")
    typeset value=""        # replacement text for ${entity}
    typeset digits=""       # hexadecimal character code of a numeric reference
    typeset in_entity=false # bool: true while we are between "&" and ";"

    # Todo: Add more HTML/MathML entities here
    entity_cache["nbsp"]=" "
    entity_cache["lt"]="<"
    entity_cache["gt"]=">"
    entity_cache["amp"]="&"
    entity_cache["quot"]="\""
    entity_cache["apos"]="'"

    # IFS= and -r make sure whitespace and backslashes are read verbatim
    while IFS= read -r -N 1 c ; do
        if ! ${in_entity} ; then
            if [ "${c}" = "&" ] ; then
                in_entity=true
                entity=""
            else
                printf "%s" "${c}"
            fi
            continue
        fi

        case "${c}" in
            ";")
                # reference is complete - decode it below
                ;;
            [a-zA-Z0-9#])
                entity+="${c}"
                continue
                ;;
            "&")
                # the previous "&" did not start a reference; emit what
                # was collected and treat this "&" as a new candidate
                debugmsg "error &${entity}${c}#"
                printf "%s" "&${entity}"
                entity=""
                continue
                ;;
            *)
                # not a reference after all - emit the text untouched
                # (including the leading "&")
                debugmsg "error &${entity}${c}#"
                printf "%s" "&${entity}${c}"
                in_entity=false
                continue
                ;;
        esac

        in_entity=false

        if [ "${entity}" = "" ] ; then
            # "&;" carries no name - pass it through
            value="&;"
        elif [ "${entity_cache["${entity}"]}" != "" ] ; then
            debugmsg "match #${entity}# = #${entity_cache["${entity}"]}#"
            value="${entity_cache["${entity}"]}"
        else
            # turn numeric references into a hexadecimal character code;
            # ${digits} stays empty for everything we cannot decode
            case "${entity}" in
                "#"[xX]*)
                    # hexadecimal literal, e.g. "&#x20AC;"
                    digits="${entity:2}"
                    case "${digits}" in
                        "" | ?????????* | *[!0-9a-fA-F]*) digits="" ;;
                    esac
                    ;;
                "#"*)
                    # decimal literal, e.g. "&#8364;"
                    digits="${entity:1}"
                    case "${digits}" in
                        "" | ?????????* | *[!0-9]*)
                            digits=""
                            ;;
                        *)
                            # force base 10 so that leading zeros are
                            # not taken as an octal prefix
                            digits="$(printf "%x" "$(( 10#${digits} ))")"
                            ;;
                    esac
                    ;;
                *)
                    digits=""
                    ;;
            esac

            if [ "${digits}" != "" ] ; then
                # the trailing "x" keeps a decoded newline from being
                # stripped by the command substitution
                value="$(printf "\u[${digits}]x")"
                value="${value%x}"
            else
                # unknown literal - pass-through
                value="<ENT=${entity}>"
            fi

            entity_cache["${entity}"]="${value}"

            debugmsg "lookup #${entity}# = #${entity_cache["${entity}"]}#"
        fi

        printf "%s" "${value}"
    done

    # input ended in the middle of a reference - emit it untouched
    ${in_entity} && printf "%s" "&${entity}"

    return 0
}

# dumb xhtml handler - no CSS,  tables, images, iframes or nested
# structures are supported (and we assume that the input is correct
# xhtml). The code was written in a trial&&error manner and should be
# rewritten to parse xhtml correctly.
#
# Callback for xml_tok which writes a plain-text rendering to stdout.
# Arguments: $1 - name of the associative array which holds the callbacks
#                 and the state of this parser run ("html_pre")
#            $2 - event type: tag_begin, tag_end, tag_text, document_start
#                 or document_end
#            $3 - tag content (tag_begin/tag_end) or text (tag_text)
function handle_html
{
    # we can't use global variables here when multiple callbacks use the same
    # callback function - but we can use the callback associative array for
    # variable storage instead
    nameref callbacks=${1}
    typeset tag_type="$2"
    typeset tag_value="$3"

    case "${tag_type}" in
        tag_begin)
            case "${tag_value}" in
                br*) printf "\n" ;;
                hr*) printf "\n-------------------------------------\n" ;;
                # "pre*" must stay in front of "p*" which would match it, too
                pre*) callbacks["html_pre"]=1 ;;
                p*)  printf "\n" ;;
            esac
            ;;

        tag_end)
            case "${tag_value}" in
                pre*) callbacks["html_pre"]=0 ;;
            esac
            ;;

        tag_text)
            # treat a missing "html_pre" entry like "not inside <pre>"
            if [ "${callbacks["html_pre"]:-0}" -eq 1 ] ; then
                # preformatted text is passed through verbatim
                printf "%s" "${tag_value}"
            else
                # compress spaces/newlines/tabs/etc. - "//" replaces every
                # run of whitespace, not only the first one
                printf "%s" "${tag_value//+([[:space:]])/ }"
            fi
            ;;

        document_start)
            callbacks["html_pre"]=0
            ;;
        document_end) ;;
    esac
}

# handle_rss - callback for xml_tok which collects the fields of each RSS
# item in the global associative array "item" and, at the end of every
# item, renders it as plain text on stdout.
# Arguments: $1 - name of the associative array which holds the callbacks
#                 and the state of this parser run ("textbuf")
#            $2 - event type: tag_begin, tag_end, tag_text, document_start
#                 or document_end
#            $3 - tag content (tag_begin/tag_end) or text (tag_text)
function handle_rss
{
    # we can't use global variables here when multiple callbacks use the same
    # callback function - but we can use the callback associative array for
    # variable storage instead
    nameref callbacks=${1}
    typeset tag_type="$2"
    typeset tag_value="$3"

    case "${tag_type}" in
        tag_begin)
            case "${tag_value}" in
                item*)
                    # start every item with a clean slate; "author" and
                    # "pubDate" are optional in a feed and would otherwise
                    # show the values of the previous item
                    item["title"]=""
                    item["link"]=""
                    item["tag"]=""
                    item["description"]=""
                    item["author"]=""
                    item["pubDate"]=""
                    ;;
            esac
            callbacks["textbuf"]=""
            ;;
        tag_end)
            case "${tag_value}" in
                item*)
                    # note that each RSS item needs to be converted separately from RSS to HTML to plain text
                    # to make sure that the state of one RSS item doesn't affect others
                    (
                        printf $"<br />#### RSS item: title: %s ####" "${item["title"]}"
                        printf $"<br />## author: %s" "${item["author"]}"
                        printf $"<br />## link:   %s" "${item["link"]}"
                        printf $"<br />## date:   %s" "${item["pubDate"]}"
                        printf $"<br />## begin description:"
                        printf $"<br />%s<br />" "${item["description"]}"
                        printf $"<br />## end description<br />"
                        print # extra newline to make sure the sed pipeline gets flushed
                    ) |
                        html_entity_to_ascii |       # convert XML entities (e.g. decode RSS content to HTML code)
                        xml_tok "xhtmltok_cb" |      # convert HTML to plain text
                        html_entity_to_ascii         # convert HTML entities
                    ;;
                # the text collected since the matching start tag is the
                # content of the element
                title*)                 item["title"]="${callbacks["textbuf"]}"       ;;
                link*)                  item["link"]="${callbacks["textbuf"]}"        ;;
                dc:creator* | author*)  item["author"]="${callbacks["textbuf"]}"      ;;
                dc:date* | pubDate*)    item["pubDate"]="${callbacks["textbuf"]}"     ;;
                description*)           item["description"]="${callbacks["textbuf"]}" ;;
            esac
            # the buffer is consumed (or irrelevant) after every end tag
            callbacks["textbuf"]=""
            ;;
        tag_text)
            callbacks["textbuf"]+="${tag_value}"
            ;;
        document_start) ;;
        document_end) ;;
    esac
}

# xml_tok - minimal XML tokenizer: reads a document from stdin and reports
# it as a sequence of events to callback functions.
# Arguments: $1 - name of an associative array; the entries
#                 "document_start", "tag_begin", "tag_end", "tag_text" and
#                 "document_end" name the function to call for that event
#                 (empty/missing entries are skipped)
# Each callback is invoked as: <function> <array name> <event> [<value>]
# where <value> is the text between "<" and ">" (without a leading "/"
# for end tags) or the text between two tags. Tags like "<br/>" produce a
# tag_begin and a tag_end event.
function xml_tok
{
    typeset buf=""
    typeset c=""
    typeset isendtag=false # bool: true/false

    nameref callbacks=${1}

    [ ! -z "${callbacks["document_start"]}" ] && ${callbacks["document_start"]} "${1}" "document_start"

    # IFS= and -r: read every character verbatim (no trimming of
    # whitespace, no backslash processing)
    while IFS= read -r -N 1 c ; do
        isendtag=false

        if [ "$c" = "<" ] ; then
            # flush the text collected in front of this tag
            if [ "$buf" != "" ] ; then
                [ ! -z "${callbacks["tag_text"]}" ] && ${callbacks["tag_text"]} "${1}" "tag_text" "$buf"
                buf=""
            fi

            # the first character decides between start and end tag
            IFS= read -r -N 1 c
            if [ "$c" = "/" ] ; then
                isendtag=true
            else
                buf="$c"
            fi
            # the remainder of the tag, up to (not including) ">"
            IFS= read -r -d '>' c
            buf+="$c"

            if ${isendtag} ; then
                [ ! -z "${callbacks["tag_end"]}" ] && ${callbacks["tag_end"]} "${1}" "tag_end" "$buf"
            else
                [ ! -z "${callbacks["tag_begin"]}" ] && ${callbacks["tag_begin"]} "${1}" "tag_begin" "$buf"

                # handle tags like <br/> (which are start- and end-tag in one piece)
                if [[ "${buf}" = */ ]] ; then
                    [ ! -z "${callbacks["tag_end"]}" ] && ${callbacks["tag_end"]} "${1}" "tag_end" "$buf"
                fi
            fi
            buf=""
        else
            buf+="$c"
        fi
    done

    # text behind the last tag would otherwise be lost
    if [ "$buf" != "" ] ; then
        [ ! -z "${callbacks["tag_text"]}" ] && ${callbacks["tag_text"]} "${1}" "tag_text" "$buf"
        buf=""
    fi

    [ ! -z "${callbacks["document_end"]}" ] && ${callbacks["document_end"]} "${1}" "document_end" "exit_success"

    print # final newline to make filters like "sed" happy
}

# get_lc_messages - print the value of LC_MESSAGES needed for subprocesses
# which want to run in a different locale/encoding: the first non-empty
# value out of ${LC_ALL}, ${LC_MESSAGES} and ${LANG}, or "C" if all three
# are unset or empty. Always returns 0.
function get_lc_messages
{
    print "${LC_ALL:-${LC_MESSAGES:-${LANG:-C}}}"
    return 0
}

# usage - have getopts process the literal argument "-?" against ${USAGE}
# and terminate the script with exit status 2.
# Reads the globals ${progname} and ${USAGE}.
function usage
{
    # NOTE(review): resetting OPTIND and handing getopts the argument "-?"
    # presumably makes ksh93's getopts print the usage message generated
    # from ${USAGE} - confirm against the ksh93 getopts documentation
    OPTIND=0
    getopts -a "${progname}" "${USAGE}" OPT '-?'
    exit 2
}

# make sure we use the ksh93 builtin versions
builtin cat
builtin printf

# callbacks for xml_tok - first pass: RSS document ----> RSS items
# ("textbuf" is scratch storage used by the handler)
typeset -A rsstok_cb
rsstok_cb=(
    ["tag_begin"]="handle_rss"
    ["tag_end"]="handle_rss"
    ["tag_text"]="handle_rss"
    ["textbuf"]=""
)

# callbacks for xml_tok - second pass: XHTML ----> plain text
# ("textbuf" and "html_pre" are scratch storage used by the handler)
typeset -A xhtmltok_cb
xhtmltok_cb=(
    ["tag_begin"]="handle_html"
    ["tag_end"]="handle_html"
    ["tag_text"]="handle_html"
    ["textbuf"]=""
    ["html_pre"]=0
)

# fields of the RSS item which is currently being collected
typeset -A item

# bookmark name ----> feed URL; a bookmark name may be given on the
# command line in place of a URL
typeset -A bookmark_urls

# "random" urls for testing
bookmark_urls["google_blogs_ksh"]="http://blogsearch.google.com/blogsearch_feeds?hl=en&scoring=d&q=ksh&ie=utf-8&num=100&output=rss"
bookmark_urls["ksh93_integration"]="http://www.opensolaris.org/rss/os/project/ksh93-integration/announcements/rss2.xml"
bookmark_urls["blogs_sun_com"]="http://blogs.sun.com/main/feed/entries/rss"
bookmark_urls["jmcp"]="http://www.jmcp.homeunix.com/roller/rss/jmcp"
bookmark_urls["katakai"]="http://blogs.sun.com/katakai/feed/entries/rss"
bookmark_urls["planetsun"]="http://www.planetsun.org/rss20.xml"
bookmark_urls["planetsolaris"]="http://www.planetsolaris.org/rss20.xml"
bookmark_urls["planetopensolaris"]="http://planet.opensolaris.org/rss20.xml"

# name under which the script was invoked; used as prefix of error
# messages and passed to getopts via its -a option
progname="${0}"

# option/usage description handed to getopts as its option string
# NOTE(review): the bracketed sections look like the ksh93 self-documenting
# getopts format - confirm against the ksh93 getopts documentation
USAGE=$'
[-?
@(#)\$Id: rssread (Roland Mainz) 2007-06-05 \$
]
[+NAME?rssread - fetch RSS messages and convert them to plain text]
[+DESCRIPTION?\brssread\b RSS to plain text converter
        which fetches RSS streams via HTTP and converts them from RSS to HTML to plain UTF-8 text.]

[ url ]

[+SEE ALSO?\bksh93\b(1)]
'

# ${USAGE} declares no options of its own, so every option which getopts
# reports ends up in the usage message
while getopts -a "${progname}" "${USAGE}" OPT ; do 
#    printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|"
    case ${OPT} in
        *)    usage ;;
    esac
done
# drop the arguments which were consumed by getopts
# NOTE(review): relies on shift evaluating "${OPTIND}-1" as an arithmetic
# expression - presumably ksh93 behaviour, confirm
shift ${OPTIND}-1

# the first operand is either a URL or the name of a bookmark
url="${1}"

# the operand is mandatory
if [ -z "${url}" ] ; then
    fatal_error $"No url given."
fi

# replace a bookmark name with the URL stored for it
if [ -n "${bookmark_urls[${url}]}" ] ; then
    printmsg $"Using bookmark ${url} = ${bookmark_urls[${url}]}"
    url="${bookmark_urls[${url}]}"
fi

(
    # set unicode locale since RSS is encoded in UTF-8
    # (and make sure $LC_MESSAGES is set to the parent
    # process's locale that all error messages are using
    # the callers locale/encoding)

    # remember the caller's message locale while LC_ALL can still be seen
    LC_MESSAGES="$(get_lc_messages)"

    # LC_ALL takes precedence over all other LC_* variables and over LANG;
    # if it were inherited from the caller none of the settings below
    # would have any effect
    unset LC_ALL

    export \
        LC_MESSAGES \
        LC_MONETARY="en_US.UTF-8" \
        LC_NUMERIC="en_US.UTF-8" \
        LC_COLLATE="en_US.UTF-8" \
        LC_CTYPE="en_US.UTF-8" \
        LC_TIME="en_US.UTF-8" \
        LANG="en_US.UTF-8"

    # fetch the feed and run it through the RSS pass of the tokenizer
    cat_http "$url" |
        xml_tok "rsstok_cb"
) # | iconv -f "UTF-8" - -
    
#EOF.