#!/bin/ksh93

#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#

#
# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
# ident "%Z%%M% %I% %E% SMI"
#

#
# rssread - a simple RSS2.0 reader with RSS to XHTML to
# plaintext conversion.
#

# Solaris needs /usr/xpg4/bin/ because the tools in /usr/bin are not POSIX-conformant
export PATH=/usr/xpg4/bin:/bin:/usr/bin

# printmsg - write all arguments as one message line to stderr.
# "-r --" keeps backslashes and leading dashes in the message literal.
function printmsg {
    print -u 2 -r -- "$@"
}

# debugmsg - debugging hook; a no-op unless the printmsg call below is
# uncommented.
function debugmsg {
    # printmsg "$@"
    true
}

# fatal_error - print "<progname>: <message>" to stderr and exit with
# status 1. Reads the global ${progname}.
function fatal_error {
    print -u 2 -r -- "${progname}: $*"
    exit 1
}

# cat_http - fetch an URL and copy the raw server response to stdout.
#
# Arguments: $1 - URL in the form "protocol://host[:port]/path"
# Outputs:   everything the server sends (the response is copied verbatim,
#            nothing is stripped); a diagnostic line goes to stderr
# Returns:   non-zero if a precheck fails (fatal_error exits the subshell)
#
# The body runs in a subshell so that neither the helper variables nor
# file descriptor 3 leak into the caller.
function cat_http {
(
    protocol="${1%://*}"
    path1="${1#*://}" # "http://foo.bat.net/x/y.html" ----> "foo.bat.net/x/y.html"

    host="${path1%%/*}"
    # Without any "/" after the host part "${path1#*/}" would not strip
    # anything and the host name would end up being requested as path.
    if [[ "${path1}" == */* ]] ; then
        path="${path1#*/}"
    else
        path=""
    fi
    port="${host##*:}"

    # If URL did not contain a port number in the host part then look at the
    # protocol to get the port number
    if [[ "${port}" == "${host}" ]] ; then
        case "${protocol}" in
            "http")
                port=80
                ;;
            *)
                port="$(getent services "${protocol}" | sed 's/[^0-9]*//;s/\/.*//')"
                ;;
        esac
    else
        host="${host%:*}"
    fi

    printmsg "protocol=${protocol} port=${port} host=${host} path=${path}"

    # prechecks
    [[ -n "${protocol}" ]] || fatal_error "protocol not set."
    [[ -n "${port}" ]]     || fatal_error "port not set."
    [[ -n "${host}" ]]     || fatal_error "host not set."
    [[ -n "${path}" ]]     || fatal_error "path not set."

    # open TCP channel
    exec 3<>"/dev/tcp/${host}/${port}"

    # send HTTP request; header lines are terminated with CRLF and the
    # URL parts are passed as printf arguments so that "%" or "\" in
    # them are sent unmodified
    printf 'GET /%s HTTP/1.0\r\nHost: %s\r\nUser-Agent: %s\r\n\r\n' \
        "${path}" \
        "${host}" \
        "ksh93/rssread (2007-01-16; $(uname -s -r -p))" >&3

    # collect response and send it to stdout
    cat <&3
)
}

# html_entity_to_ascii - filter which copies stdin to stdout and replaces
# character entities with the characters they stand for.
#
# Handled are the named entities listed below, decimal references
# ("&#65;") and hexadecimal references ("&#x41;"). Everything else is
# passed through unchanged: unknown entities, a "&" which does not start
# an entity (e.g. "AT&T") and an entity cut off by the end of input.
function html_entity_to_ascii {
    typeset -A entity_cache
    typeset c="" entity="" value="" code="" terminated=""

    # Todo: Add more HTML/MathML entities here
    entity_cache["nbsp"]=" "
    entity_cache["lt"]="<"
    entity_cache["gt"]=">"
    entity_cache["amp"]="&"
    entity_cache["quot"]="\""
    entity_cache["apos"]="'"

    while read -r -N 1 c ; do
        if [[ "${c}" != "&" ]] ; then
            printf "%s" "${c}"
            continue
        fi

        # collect the entity name up to the terminating ";"
        entity=""
        terminated="false"
        while read -r -N 1 c ; do
            case "${c}" in
                ";")
                    terminated="true"
                    break
                    ;;
                "&")
                    # the previous "&" was plain text; this one may
                    # start a real entity
                    printf "%s" "&${entity}"
                    entity=""
                    ;;
                [a-zA-Z0-9#])
                    entity+="${c}"
                    ;;
                *)
                    # not an entity - emit the consumed text unchanged
                    debugmsg "error &${entity}${c}#"
                    printf "%s" "&${entity}${c}"
                    continue 2
                    ;;
            esac
        done

        # end of input in the middle of an entity
        if [[ "${terminated}" != "true" ]] ; then
            printf "%s" "&${entity}"
            break
        fi

        # "&;" is not an entity (and an empty array subscript is invalid)
        if [[ -z "${entity}" ]] ; then
            printf "%s" "&;"
            continue
        fi

        value="${entity_cache["${entity}"]}"
        if [[ -n "${value}" ]] ; then
            debugmsg "match #${entity}# = #${value}#"
        else
            case "${entity}" in
                "#"[xX]*)
                    # hexadecimal literal
                    code="${entity:2}"
                    if [[ -n "${code}" && ${#code} -le 8 && "${code}" != *[!0-9a-fA-F]* ]] ; then
                        value="$(printf "\u[${code}]")"
                    fi
                    ;;
                "#"*)
                    # decimal literal ("10#" prevents leading zeros from
                    # being read as octal)
                    code="${entity:1}"
                    if [[ -n "${code}" && ${#code} -le 8 && "${code}" != *[!0-9]* ]] ; then
                        value="$(printf "\u[$(printf "%x" "$(( 10#${code} ))")]")"
                    fi
                    ;;
            esac

            # unknown literal - pass-through
            [[ -n "${value}" ]] || value="&${entity};"

            entity_cache["${entity}"]="${value}"
            debugmsg "lookup #${entity}# = #${entity_cache["${entity}"]}#"
        fi

        printf "%s" "${value}"
    done
}

# dumb xhtml handler - no CSS, tables, images, iframes or nested
# structures are supported (and we assume that the input is correct
# xhtml). The code was written in a trial&&error manner and should be
# rewritten to parse xhtml correctly.
# handle_html - xml_tok callback which renders (X)HTML as plain text on
# stdout.
#
# Arguments: $1 - name of the callback associative array (also used to
#                 store the "html_pre" state)
#            $2 - event type: tag_begin, tag_end, tag_text,
#                 document_start or document_end
#            $3 - tag name incl. attributes, or the text
function handle_html {
    # we can't use global variables here when multiple callbacks use the same
    # callback function - but we can use the callback associative array for
    # variable storage instead
    nameref callbacks=${1}
    typeset tag_type="$2"
    typeset tag_value="$3"

    case "${tag_type}" in
        tag_begin)
            case "${tag_value}" in
                br*)  printf "\n" ;;
                hr*)  printf "\n-------------------------------------\n" ;;
                # "pre*" must stay in front of "p*" which would match it, too
                pre*) callbacks["html_pre"]=1 ;;
                p*)   printf "\n" ;;
            esac
            ;;

        tag_end)
            case "${tag_value}" in
                pre*) callbacks["html_pre"]=0 ;;
            esac
            ;;

        tag_text)
            if [[ "${callbacks["html_pre"]}" == "1" ]] ; then
                # preformatted text is copied verbatim
                printf "%s" "${tag_value}"
            else
                # compress spaces/newlines/tabs/etc.
                # ("//" - every run of whitespace, not only the first one)
                printf "%s" "${tag_value//+([\n\r\t\v[:space:][:blank:]])/ }"
            fi
            ;;

        document_start)
            callbacks["html_pre"]=0
            ;;

        document_end)
            ;;
    esac
}

# handle_rss - xml_tok callback which collects the fields of each RSS
# <item> in the global associative array "item" and prints the item as
# plain text when its end tag is seen.
#
# Arguments: $1 - name of the callback associative array (its "textbuf"
#                 element accumulates the text of the current tag)
#            $2 - event type: tag_begin, tag_end, tag_text,
#                 document_start or document_end
#            $3 - tag name incl. attributes, or the text
function handle_rss {
    # we can't use global variables here when multiple callbacks use the same
    # callback function - but we can use the callback associative array for
    # variable storage instead
    nameref callbacks=${1}
    typeset tag_type="$2"
    typeset tag_value="$3"

    case "${tag_type}" in
        tag_begin)
            case "${tag_value}" in
                item*)
                    # start every item with empty fields so that values
                    # of the previous item cannot show up in this one
                    item["title"]=""
                    item["link"]=""
                    item["tag"]=""
                    item["description"]=""
                    item["author"]=""
                    item["pubDate"]=""
                    ;;
            esac
            callbacks["textbuf"]=""
            ;;

        tag_end)
            case "${tag_value}" in
                item*)
                    # note that each RSS item needs to be converted separately from RSS to HTML to plain text
                    # to make sure that the state of one RSS item doesn't affect others
                    #
                    # Line breaks are written as "<br />" because this text
                    # is rendered by the HTML handler below, which compresses
                    # plain whitespace.
                    (
                        printf $"<br />#### RSS item: title: %s ####" "${item["title"]}"
                        printf $"<br />## author: %s" "${item["author"]}"
                        printf $"<br />## link: %s" "${item["link"]}"
                        printf $"<br />## date: %s" "${item["pubDate"]}"
                        printf $"<br />## begin description:"
                        printf $"<br />%s<br />" "${item["description"]}"
                        printf $"<br />## end description<br />"
                        print # extra newline to make sure the sed pipeline gets flushed
                    ) |
                        html_entity_to_ascii |  # convert XML entities (e.g. decode RSS content to HTML code)
                        xml_tok "xhtmltok_cb" | # convert HTML to plain text
                        html_entity_to_ascii    # convert HTML entities
                    ;;
                title*)
                    item["title"]="${callbacks["textbuf"]}"
                    ;;
                link*)
                    item["link"]="${callbacks["textbuf"]}"
                    ;;
                dc:creator* | author*)
                    item["author"]="${callbacks["textbuf"]}"
                    ;;
                dc:date* | pubDate*)
                    item["pubDate"]="${callbacks["textbuf"]}"
                    ;;
                description*)
                    item["description"]="${callbacks["textbuf"]}"
                    ;;
            esac
            # the collected text belongs to the tag which just ended
            callbacks["textbuf"]=""
            ;;

        tag_text)
            callbacks["textbuf"]+="${tag_value}"
            ;;

        document_start)
            ;;

        document_end)
            ;;
    esac
}

# xml_tok - minimal XML tokenizer: reads a document from stdin and calls
# the callbacks registered in an associative array.
#
# Arguments: $1 - NAME of the associative array. The elements
#                 "document_start", "tag_begin", "tag_end", "tag_text"
#                 and "document_end" hold the name of the function to
#                 call for that event; empty or missing elements are
#                 skipped. Callbacks are called as
#                     <function> <array name> <event> [<value>]
# Outputs:   a final newline (plus whatever the callbacks print)
function xml_tok {
    typeset buf=""
    typeset c=""
    typeset isendtag=false
    nameref callbacks=${1}

    [[ -z "${callbacks["document_start"]}" ]] || ${callbacks["document_start"]} "${1}" "document_start"

    while read -r -N 1 c ; do
        if [[ "${c}" != "<" ]] ; then
            # plain text - collect it until the next tag starts
            buf+="${c}"
            continue
        fi

        # a tag starts - hand over the text collected so far
        if [[ -n "${buf}" ]] ; then
            [[ -z "${callbacks["tag_text"]}" ]] || ${callbacks["tag_text"]} "${1}" "tag_text" "${buf}"
            buf=""
        fi

        # a "/" directly after the "<" marks an end tag
        isendtag=false
        read -r -N 1 c
        if [[ "${c}" == "/" ]] ; then
            isendtag=true
        else
            buf="${c}"
        fi

        # rest of the tag up to (but not including) the closing ">"
        read -r -d '>' c
        buf+="${c}"

        if ${isendtag} ; then
            [[ -z "${callbacks["tag_end"]}" ]] || ${callbacks["tag_end"]} "${1}" "tag_end" "${buf}"
        else
            [[ -z "${callbacks["tag_begin"]}" ]] || ${callbacks["tag_begin"]} "${1}" "tag_begin" "${buf}"

            # handle tags like <br/> (which are start- and end-tag in one piece)
            if [[ "${buf}" == */ ]] ; then
                [[ -z "${callbacks["tag_end"]}" ]] || ${callbacks["tag_end"]} "${1}" "tag_end" "${buf}"
            fi
        fi
        buf=""
    done

    # the end-of-document event goes to the "document_end" callback
    [[ -z "${callbacks["document_end"]}" ]] || ${callbacks["document_end"]} "${1}" "document_end" "exit_success"

    print # final newline to make filters like "sed" happy
}

# return the value of LC_MESSAGES needed for subprocesses which
# want to run in a different locale/encoding
# (first non-empty value of LC_ALL, LC_MESSAGES and LANG, else "C")
function get_lc_messages {
    typeset candidate

    for candidate in "${LC_ALL}" "${LC_MESSAGES}" "${LANG}" ; do
        if [[ -n "${candidate}" ]] ; then
            print -r -- "${candidate}"
            return 0
        fi
    done

    print "C"
    return 0
}

# usage - let getopts print the usage message generated from ${USAGE}
# and exit with status 2
function usage {
    OPTIND=0
    getopts -a "${progname}" "${USAGE}" OPT '-?'
    exit 2
}

# make sure we use the ksh93 builtin versions
builtin cat
builtin printf

typeset -A rsstok_cb # callbacks for xml_tok
rsstok_cb["tag_begin"]="handle_rss"
rsstok_cb["tag_end"]="handle_rss"
rsstok_cb["tag_text"]="handle_rss"
rsstok_cb["textbuf"]=""

typeset -A xhtmltok_cb # callbacks for xml_tok
xhtmltok_cb["tag_begin"]="handle_html"
xhtmltok_cb["tag_end"]="handle_html"
xhtmltok_cb["tag_text"]="handle_html"
xhtmltok_cb["textbuf"]=""
xhtmltok_cb["html_pre"]=0

# fields of the RSS item which is currently being read (see handle_rss)
typeset -A item

typeset -A bookmark_urls

# "random" urls for testing
bookmark_urls=(
    ["google_blogs_ksh"]="http://blogsearch.google.com/blogsearch_feeds?hl=en&scoring=d&q=ksh&ie=utf-8&num=100&output=rss"
    ["ksh93_integration"]="http://www.opensolaris.org/rss/os/project/ksh93-integration/announcements/rss2.xml"
    ["blogs_sun_com"]="http://blogs.sun.com/main/feed/entries/rss"
    ["jmcp"]="http://www.jmcp.homeunix.com/roller/rss/jmcp"
    ["katakai"]="http://blogs.sun.com/katakai/feed/entries/rss"
    ["planetsun"]="http://www.planetsun.org/rss20.xml"
    ["planetsolaris"]="http://www.planetsolaris.org/rss20.xml"
    ["planetopensolaris"]="http://planet.opensolaris.org/rss20.xml"
)

progname="${0}"

# usage description in the self-documenting ksh93 getopts format
USAGE=$'
[-?
@(#)\$Id: rssread (Roland Mainz) 2007-06-05 \$
]
[+NAME?rssread - fetch RSS messages and convert them to plain text]
[+DESCRIPTION?\brssread\b RSS to plain text converter
    which fetches RSS streams via HTTP and converts them from RSS to HTML to plain UTF-8 text.]

[ url ]

[+SEE ALSO?\bksh93\b(1)]
'

# rssread has no options of its own; everything getopts hands over
# (including -? and --man) ends in the usage message
while getopts -a "${progname}" "${USAGE}" OPT ; do
    # printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|"
    case ${OPT} in
        *) usage ;;
    esac
done
shift $((OPTIND-1))

url="$1"

if [[ -z "${url}" ]] ; then
    fatal_error $"No url given."
fi

# the argument may be the name of one of the bookmarks above
if [[ -n "${bookmark_urls["${url}"]}" ]] ; then
    printmsg $"Using bookmark ${url} = ${bookmark_urls["${url}"]}"
    url="${bookmark_urls["${url}"]}"
fi

(
    # set unicode locale since RSS is encoded in UTF-8
    # (and make sure $LC_MESSAGES is set to the parent
    # process's locale that all error messages are using
    # the callers locale/encoding)
    export \
        LC_MESSAGES="$(get_lc_messages)" \
        LC_MONETARY="en_US.UTF-8" \
        LC_NUMERIC="en_US.UTF-8" \
        LC_COLLATE="en_US.UTF-8" \
        LC_CTYPE="en_US.UTF-8" \
        LC_TIME="en_US.UTF-8" \
        LANG="en_US.UTF-8"

    cat_http "${url}" |
        xml_tok "rsstok_cb"
) # | iconv -f "UTF-8" - -

#EOF.