diff options
author | April Chin <April.Chin@Sun.COM> | 2008-12-27 14:59:38 -0800 |
---|---|---|
committer | April Chin <April.Chin@Sun.COM> | 2008-12-27 14:59:38 -0800 |
commit | 7c2fbfb345896881c631598ee3852ce9ce33fb07 (patch) | |
tree | 4b173b5657508562dfc0aa05f7d056d1e9add505 /usr/src/lib/libshell/common/scripts/rssread.sh | |
parent | 6071ac1de68fed78e1e10052045bbb5f1732a263 (diff) | |
download | illumos-joyent-7c2fbfb345896881c631598ee3852ce9ce33fb07.tar.gz |
PSARC/2008/094 ksh93 Update 1
PSARC/2008/344 ksh93 Integration Update 1 Amendments 1
PSARC/2008/589 Remove /usr/bin/printf from PSARC case 2008 094
6619428 *ksh93* RFE: Update ksh93 in Solaris to ast-ksh.2008-11-04
6788659 RFE: Update libpp in Solaris to ast-open.2008-07-25
6561901 RFE: Add "shcomp" (shell script compiler) + kernel module to exec binary sh code
6599668 RFE: Move consumers of alias.sh over to ksh93
6595183 *ksh93* RFE: Update ksh93-integration demo code
6775901 *ksh93* no C message catalogs are generated for ksh93
6451262 *sleep* RFE: /usr/bin/sleep should support floating-point values
6687139 *ksh93* command substitution, exec, and stdout redirection cause allocation loop
6703761 *ksh93* crashes in script containing uncommon output redirections
6715496 *ksh93* SEGVs on array reinitialization
6713682 *ksh93* Creating a compound variable in a subshell "bleeds through" to the calling subshell
6672350 *ksh93* causes parent shell to die when child shell is suspended
6745015 *ksh93* VARIABLE=`command substitution` assignment is not reliable on OpenSolaris
6710205 *ksh93* problem with command substitution (within back quotes) containing \$'
6737600 *ksh93* exits debugger when user presses ctrl-c
6748645 *ksh93* fc -l -e - is mis-parsed, outputs wrong error message "-e - requires single argument"
6754020 *ksh93* does weird '[' expansion
6753538 *ksh93* umask modification leaks out of a ksh93 subshell
6766246 *ksh93* bug in pattern matching
6763594 *ksh93* executes command after "command" builtin twice on failure
6762665 *ksh93* Difficult-to-reproduce SIGSEGV in ksh93
Diffstat (limited to 'usr/src/lib/libshell/common/scripts/rssread.sh')
-rw-r--r-- | usr/src/lib/libshell/common/scripts/rssread.sh | 554 |
1 files changed, 554 insertions, 0 deletions
diff --git a/usr/src/lib/libshell/common/scripts/rssread.sh b/usr/src/lib/libshell/common/scripts/rssread.sh new file mode 100644 index 0000000000..fea8627178 --- /dev/null +++ b/usr/src/lib/libshell/common/scripts/rssread.sh @@ -0,0 +1,554 @@ +#!/usr/bin/ksh93 + +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# rssread - a simple RSS2.0 reader with RSS to XHTML to +# plaintext conversion. +# + +# Solaris needs /usr/xpg6/bin:/usr/xpg4/bin because the tools in /usr/bin are not POSIX-conformant +export PATH=/usr/xpg6/bin:/usr/xpg4/bin:/bin:/usr/bin + +function printmsg +{ + print -u2 "$*" +} + +function debugmsg +{ +# printmsg "$*" +true +} + +function fatal_error +{ + print -u2 "${progname}: $*" + exit 1 +} + +# parse HTTP return code, cookies etc. +function parse_http_response +{ + nameref response="$1" + typeset h statuscode statusmsg i + + # we use '\r' as additional IFS to filter the final '\r' + IFS=$' \t\r' read -r h statuscode statusmsg # read HTTP/1.[01] <code> + [[ "$h" != ~(Eil)HTTP/.* ]] && { print -u2 -f $"%s: HTTP/ header missing\n" "$0" ; return 1 ; } + [[ "$statuscode" != ~(Elr)[0-9]* ]] && { print -u2 -f $"%s: invalid status code\n" "$0" ; return 1 ; } + response.statuscode="$statuscode" + response.statusmsg="$statusmsg" + + # skip remaining headers + while IFS='' read -r i ; do + [[ "$i" == $'\r' ]] && break + + # strip '\r' at the end + i="${i/~(Er)$'\r'/}" + + case "$i" in + ~(Eli)Content-Type:.*) + response.content_type="${i/~(El).*:[[:blank:]]*/}" + ;; + ~(Eli)Content-Length:[[:blank:]]*[0-9]*) + integer response.content_length="${i/~(El).*:[[:blank:]]*/}" + ;; + ~(Eli)Transfer-Encoding:.*) + response.transfer_encoding="${i/~(El).*:[[:blank:]]*/}" + ;; + esac + done + + return 0 +} + +function cat_http_body +{ + typeset emode="$1" + typeset hexchunksize="0" + integer chunksize=0 + + if [[ "${emode}" == "chunked" ]] ; then + while IFS=$'\r' read hexchunksize && + [[ "${hexchunksize}" == ~(Elri)[0-9abcdef]* ]] && + (( chunksize=16#${hexchunksize} )) && (( chunksize > 0 )) ; do + dd bs=1 count="${chunksize}" 2>/dev/null + done + else + cat + fi + + return 0 +} + +function cat_http +{ + typeset protocol="${1%://*}" + typeset path1="${1#*://}" # "http://foo.bat.net/x/y.html" ----> "foo.bat.net/x/y.html" + + typeset host="${path1%%/*}" + typeset path="${path1#*/}" + typeset port="${host##*:}" + + integer netfd + typeset -C httpresponse # http response + + # If URL did not contain a port number in the host part then look at the + # protocol to get the port number + if [[ "${port}" == "${host}" ]] ; then + case "${protocol}" in + "http") port=80 ;; + *) port="$(getent services "${protocol}" | sed 's/[^0-9]*//;s/\/.*//')" ;; + esac + else + host="${host%:*}" + fi + + printmsg "protocol=${protocol} port=${port} host=${host} path=${path}" + + # prechecks + [[ "${protocol}" == "" ]] && { print -u2 -f "%s: protocol not set.\n" "$0" ; return 1 ; } + [[ "${port}" == "" ]] && { print -u2 -f "%s: port not set.\n" "$0" ; return 1 ; } + [[ "${host}" == "" ]] && { print -u2 -f "%s: host not set.\n" "$0" ; return 1 ; } + [[ "${path}" == "" ]] && { print -u2 -f "%s: path not set.\n" "$0" ; return 1 ; } + + # open TCP channel + redirect {netfd}<>"/dev/tcp/${host}/${port}" + (( $? != 0 )) && { print -u2 -f "%s: Couldn't open %s\n" "$0" "${1}" ; return 1 ; } + + # send HTTP request + request="GET /${path} HTTP/1.1\r\n" + request+="Host: ${host}\r\n" + request+="User-Agent: rssread/ksh93 (2008-10-14; $(uname -s -r -p))\r\n" + request+="Connection: close\r\n" + print -n -- "${request}\r\n" >&${netfd} + + # collect response and send it to stdout + parse_http_response httpresponse <&${netfd} + cat_http_body "${httpresponse.transfer_encoding}" <&${netfd} + + # close connection + redirect {netfd}<&- + + return 0 +} + +function html_entity_to_ascii +{ + typeset buf + typeset entity + typeset c + typeset value + + # Todo: Add more HTML/MathML entities here + # Note we use a static variable (typeset -S) here to make sure we + # don't loose the cache data between calls + typeset -S -A entity_cache=( + # entity to ascii (fixme: add UTF-8 transliterations) + ["nbsp"]=' ' + ["lt"]='<' + ["le"]='<=' + ["gt"]='>' + ["ge"]='>=' + ["amp"]='&' + ["quot"]='"' + ["apos"]="'" + ) + + buf="" + while IFS='' read -r -N 1 c ; do + if [[ "$c" != "&" ]] ; then + print -n -r -- "${c}" + continue + fi + + entity="" + while IFS='' read -r -N 1 c ; do + case "$c" in + ";") + break + ;; + ~(Eilr)[a-z0-9#]) + entity+="$c" + continue + ;; + *) +# debugmsg "error &${entity}${c}#" + + print -n -r -- "${entity}${c}" + entity="" + continue 2 + ;; + esac + done + + value="" + if [[ "${entity_cache["${entity}"]}" != "" ]] ; then +# debugmsg "match #${entity}# = #${entity_cache["${entity}"]}#" + value="${entity_cache["${entity}"]}" + else + if [[ "${entity:0:1}" == "#" ]] ; then + # decimal literal + value="${ printf "\u[${ printf "%x" "${entity:1:8}" ; }]" ; }" + elif [[ "${entity:0:7}" == ~(Eilr)[0-9a-f]* ]] ; then + # hexadecimal literal + value="${ printf "\u[${entity:0:7}]" ; }" + else + # unknown literal - pass-through + value="ENT=|${entity}|" + fi + + entity_cache["${entity}"]="${value}" + +# debugmsg "lookup #${entity}# = #${entity_cache["${entity}"]}#" + fi + + printf "%s" "${value}" + done + + return 0 +} + +# dumb xhtml handler - no CSS, tables, images, iframes or nested +# structures are supported (and we assume that the input is correct +# xhtml). The code was written in a trial&&error manner and should be +# rewritten to parse xhtml correctly. +function handle_html +{ + # we can't use global variables here when multiple callbacks use the same + # callback function - but we can use the callback associative array for + # variable storage instead + nameref callbacks=${1} + typeset tag_type="$2" + typeset tag_value="$3" + + case "${tag_type}" in + tag_begin) + case "${tag_value}" in + br) printf "\n" ;; + hr) printf "\n-------------------------------------\n" ;; + pre) callbacks["html_pre"]='true' ;; + p) printf "\n" ;; + esac + ;; + + tag_end) + case "${tag_value}" in + pre) callbacks["html_pre"]='false' ;; + esac + ;; + + tag_text) + if ${callbacks["html_pre"]} ; then + printf "%s" "${tag_value}" + else + # compress spaces/newlines/tabs/etc. + printf "%s" "${tag_value//+([\n\r\t\v[:space:][:blank:]])/ }" + fi + ;; + + document_start) + callbacks["html_pre"]='false' + ;; + document_end) ;; + esac + + return 0 +} + +function handle_rss +{ + # we can't use global variables here when multiple callbacks use the same + # callback function - but we can use the callback associative array for + # variable storage instead + nameref callbacks=${1} + typeset tag_type="$2" + typeset tag_value="$3" + + case "${tag_type}" in + tag_begin) + case "${tag_value}" in + item) + item["title"]="" + item["link"]="" + item["tag"]="" + item["description"]="" + ;; + esac + callbacks["textbuf"]="" + ;; + tag_end) + case "${tag_value}" in + item) + # note that each RSS item needs to be converted seperately from RSS to HTML to plain text + # to make sure that the state of one RSS item doesn't affect others + ( + printf $"<br />#### RSS item: title: %s ####" "${item["title"]}" + printf $"<br />## author: %s" "${item["author"]}" + printf $"<br />## link: %s" "${item["link"]}" + printf $"<br />## date: %s" "${item["pubDate"]}" + printf $"<br />## begin description:" + printf $"<br />%s<br />" "${item["description"]}" + printf $"<br />## end description<br />" + print # extra newline to make sure the sed pipeline gets flushed + ) | + html_entity_to_ascii | # convert XML entities (e.g. decode RSS content to HTML code) + xml_tok "xhtmltok_cb" | # convert HTML to plain text + html_entity_to_ascii # convert HTML entities + ;; + title) item["title"]="${callbacks["textbuf"]}" ; callbacks["textbuf"]="" ;; + link) item["link"]="${callbacks["textbuf"]}" ; callbacks["textbuf"]="" ;; + dc:creator | author) item["author"]="${callbacks["textbuf"]}" ; callbacks["textbuf"]="" ;; + dc:date | pubDate) item["pubDate"]="${callbacks["textbuf"]}" ; callbacks["textbuf"]="" ;; + description) item["description"]="${callbacks["textbuf"]}" ; callbacks["textbuf"]="" ;; + esac + callbacks["textbuf"]="" + ;; + tag_text) + callbacks["textbuf"]+="${tag_value}" + ;; + document_start) ;; + document_end) ;; + esac + return 0 +} + +function xml_tok +{ + typeset buf="" + typeset namebuf="" + typeset attrbuf="" + typeset c="" + typeset isendtag # bool: true/false + typeset issingletag # bool: true/false (used for tags like "<br />") + nameref callbacks=${1} + + [[ ! -z "${callbacks["document_start"]}" ]] && ${callbacks["document_start"]} "${1}" "document_start" + + while IFS='' read -r -N 1 c ; do + isendtag=false + + if [[ "$c" == "<" ]] ; then + # flush any text content + if [[ "$buf" != "" ]] ; then + [[ ! -z "${callbacks["tag_text"]}" ]] && ${callbacks["tag_text"]} "${1}" "tag_text" "$buf" + buf="" + fi + + IFS='' read -r -N 1 c + if [[ "$c" == "/" ]] ; then + isendtag=true + else + buf="$c" + fi + IFS='' read -r -d '>' c + buf+="$c" + + # handle comments + if [[ "$buf" == ~(El)!-- ]] ; then + # did we read the comment completely ? + if [[ "$buf" != ~(Elr)!--.*-- ]] ; then + buf+=">" + while [[ "$buf" != ~(Elr)!--.*-- ]] ; do + IFS='' read -r -N 1 c || break + buf+="$c" + done + fi + + [[ ! -z "${callbacks["tag_comment"]}" ]] && ${callbacks["tag_comment"]} "${1}" "tag_comment" "${buf:3:${#buf}-5}" + buf="" + continue + fi + + # check if the tag starts and ends at the same time (like "<br />") + if [[ "${buf}" == ~(Er).*/ ]] ; then + issingletag=true + buf="${buf%*/}" + else + issingletag=false + fi + + # check if the tag has attributes (e.g. space after name) + if [[ "$buf" == ~(E)[[:space:][:blank:]] ]] ; then + namebuf="${buf%%~(E)[[:space:][:blank:]].*}" + attrbuf="${buf#~(E).*[[:space:][:blank:]]}" + else + namebuf="$buf" + attrbuf="" + fi + + if ${isendtag} ; then + [[ ! -z "${callbacks["tag_end"]}" ]] && ${callbacks["tag_end"]} "${1}" "tag_end" "$namebuf" + else + [[ ! -z "${callbacks["tag_begin"]}" ]] && ${callbacks["tag_begin"]} "${1}" "tag_begin" "$namebuf" "$attrbuf" + + # handle tags like <br/> (which are start- and end-tag in one piece) + if ${issingletag} ; then + [[ ! -z "${callbacks["tag_end"]}" ]] && ${callbacks["tag_end"]} "${1}" "tag_end" "$namebuf" + fi + fi + buf="" + else + buf+="$c" + fi + done + + [[ ! -z "${callbacks["document_end"]}" ]] && ${callbacks["document_end"]} "${1}" "document_end" "exit_success" + + print # final newline to make filters like "sed" happy +} + +# return the value of LC_MESSAGES needed for subprocesses which +# want to run in a different locale/encoding +function get_lc_messages +{ + [[ "${LC_ALL}" != "" ]] && { print "${LC_ALL}" ; return 0 ; } + [[ "${LC_MESSAGES}" != "" ]] && { print "${LC_MESSAGES}" ; return 0 ; } + [[ "${LANG}" != "" ]] && { print "${LANG}" ; return 0 ; } + print "C" ; return 0 +} + +function do_rssread +{ + # set unicode locale since RSS is encoded in UTF-8 + # (and make sure $LC_MESSAGES is set to the parent + # process's locale that all error messages are using + # the callers locale/encoding) + export \ + LC_MESSAGES="${ get_lc_messages ; }" \ + LC_MONETARY="en_US.UTF-8" \ + LC_NUMERIC="en_US.UTF-8" \ + LC_COLLATE="en_US.UTF-8" \ + LC_CTYPE="en_US.UTF-8" \ + LC_TIME="en_US.UTF-8" \ + LANG="en_US.UTF-8" + + # need extra newline after cat_http to terminate line with $'\n' + # to make "xml_tok" happy + { cat_http "$1" ; print ; } | + xml_tok "rsstok_cb" + return 0 +} + +function usage +{ + OPTIND=0 + getopts -a "${progname}" "${rssread_usage}" OPT '-?' + exit 2 +} + +# make sure we use the ksh93 builtin versions +builtin basename +builtin cat + +typeset -A rsstok_cb # callbacks for xml_tok +rsstok_cb["tag_begin"]="handle_rss" +rsstok_cb["tag_end"]="handle_rss" +rsstok_cb["tag_text"]="handle_rss" +rsstok_cb["textbuf"]="" + +typeset -A xhtmltok_cb # callbacks for xml_tok +xhtmltok_cb["tag_begin"]="handle_html" +xhtmltok_cb["tag_end"]="handle_html" +xhtmltok_cb["tag_text"]="handle_html" +xhtmltok_cb["textbuf"]="" +xhtmltok_cb["html_pre"]='false' + +typeset -A item + +typeset -A bookmark_urls + +# "ramdom" urls for testing +bookmark_urls=( + ["google_blogs_ksh"]="http://blogsearch.google.com/blogsearch_feeds?hl=en&scoring=d&q=(%22ksh93%22%7C%22ksh+93%22+%7C+%22korn93%22+%7C+%22korn+93%22)&ie=utf-8&num=100&output=rss" + # OpenSolaris.org sites + ["ksh93_integration"]="http://www.opensolaris.org/rss/os/project/ksh93-integration/announcements/rss2.xml" + ["shell"]="http://www.opensolaris.org/rss/os/project/shell/announcements/rss2.xml" + ["systemz"]="http://www.opensolaris.org/rss/os/project/systemz/announcements/rss2.xml" + # some Sun staff/sites + ["blogs_sun_com"]="http://blogs.sun.com/main/feed/entries/rss" + ["bigadmin"]="http://www.sun.com/bigadmin/content/rss/motd.xml" + ["jmcp"]="http://www.jmcp.homeunix.com/roller/jmcp/feed/entries/rss" + ["katakai"]="http://blogs.sun.com/katakai/feed/entries/rss" + ["alanc"]="http://blogs.sun.com/alanc/feed/entries/rss" + ["planetsun"]="http://www.planetsun.org/rss20.xml" + ["planetsolaris"]="http://www.planetsolaris.org/rss20.xml" + ["planetopensolaris"]="http://planet.opensolaris.org/rss20.xml" + ["theregister_uk"]="http://www.theregister.co.uk/headlines.rss" + ["heise"]="http://www.heise.de/newsticker/heise.rdf" + ["slashdot"]="http://rss.slashdot.org/Slashdot/slashdot" +) + +typeset progname="${ basename "${0}" ; }" + +typeset -r rssread_usage=$'+ +[-?\n@(#)\$Id: rssread (Roland Mainz) 2008-11-10 \$\n] +[-author?Roland Mainz <roland.mainz@sun.com>] +[-author?Roland Mainz <roland.mainz@nrubsig.org>] +[+NAME?rssread - fetch RSS messages and convert them to plain text] +[+DESCRIPTION?\brssread\b RSS to plain text converter + which fetches RSS streams via HTTP and converts them from + RSS to HTML to plain text in the current locale/encoding.] +[I:noiconv?Do not convert data from UTF-8 to current locale/encoding.] + +[ url ] + +[+SEE ALSO?\bksh93\b(1), \bshnote\b(1)] +' + +typeset noiconv=false + +while getopts -a "${progname}" "${rssread_usage}" OPT ; do +# printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|" + case ${OPT} in + I) noiconv=true ;; + +I) noiconv=false ;; + *) usage ;; + esac +done +shift $((OPTIND-1)) + +typeset url="$1" + +if [[ "${url}" == "" ]] ; then + fatal_error $"No url given." +fi + +if [[ "${bookmark_urls[${url}]}" != "" ]] ; then + printmsg $"Using bookmark ${url} = ${bookmark_urls[${url}]}" + url="${bookmark_urls[${url}]}" +fi + +if ${noiconv} ; then + do_rssread "${url}" +else + do_rssread "${url}" | iconv -f "UTF-8" - - +fi + +exit 0 +#EOF. |