#!/usr/bin/python
import urllib, re, cgi, string, ftplib, sys
from HTMLParser import HTMLParser
# Simple Python app to generate an RDF Site Summary for http://www.cix.co.uk/~jimh/weblog/blogger.html
# It can also FTP the generated file if required (see the params comment below).
#
# Based on a script by Mark Pilgrim: http://diveintomark.org/archives/2002/08/06.html#feeding_zeldman
#
# Jim Hughes - Email: jim@fineway.cx - Jabber: jimh@amessage.de or jimh@jabber.at
class MyHTMLParser(HTMLParser):
    """Reduce an HTML fragment to a title plus a block of escaped text.

    Every tag is dropped except <a>, which is re-emitted together with its
    attributes; <br> and <p> become CRLF line breaks.  Whatever text has
    been collected when the first </b> is seen becomes the title, and
    everything collected after that point is the content.  Read the results
    with get_title() and get_content().
    """

    def reset(self):
        """Discard collected text so the parser can take a new fragment."""
        HTMLParser.reset(self)
        self._title = ""
        self._content = ""
        self._hadtitle = 0  # becomes 1 once the first </b> has been seen

    def _xml_escape(self, text, extra):
        """Return text with &, < and > escaped, plus the pairs in extra."""
        # Imported locally so the class carries its own dependency.
        # cgi.escape was removed in Python 3.8; saxutils.escape is available
        # on both Python 2 and Python 3 and escapes the same three characters.
        from xml.sax.saxutils import escape
        return escape(text, extra)

    def handle_starttag(self, tag, attrs):
        """Turn <br>/<p> into CRLF and re-emit <a> with its attributes."""
        if tag in ("br", "p"):
            self._content += "\r\n"
        elif tag == "a":
            # NOTE(review): anchor markup is written literally while text is
            # entity-escaped in handle_data; confirm this asymmetry is what
            # the feed consumer expects.
            self._content += "<a"
            for k, v in attrs:
                if v is None:
                    # Valueless attribute (e.g. <a name>): nothing to quote.
                    self._content += " " + k
                else:
                    # The value sits inside double quotes, so '"' must be
                    # escaped as well as &, < and >.
                    self._content += (
                        ' ' + k + '="'
                        + self._xml_escape(v, {'"': "&quot;"}) + '"'
                    )
            self._content += ">"

    def handle_endtag(self, tag):
        """Close anchors; on the first </b>, move collected text to the title."""
        if self._hadtitle == 0 and tag == "b":
            self._hadtitle = 1
            self._title = self._content
            self._content = ""
        if tag == "a":
            self._content += "</" + tag + ">"

    def handle_data(self, data):
        """Append text with &, <, > and both quote characters escaped."""
        self._content += self._xml_escape(
            data, {"'": "&apos;", '"': "&quot;"}
        )

    def handle_entityref(self, name):
        """Spell &amp; out as the word "and"; drop every other entity."""
        if name == "amp":
            self._content += "and"

    def get_title(self):
        """Return the title with surrounding whitespace removed."""
        return self._title.strip()

    def get_content(self):
        """Return the content with surrounding whitespace removed."""
        return self._content.strip()
# Trivial function to put an Ascii file onto an ftp site
def ftpPut( site, user, pwd, path, srcfile, destfile ):
    """Upload a local file to an FTP server line by line (ASCII transfer).

    site     -- host name of the FTP server
    user     -- login name
    pwd      -- login password
    path     -- remote directory to change into before uploading
    srcfile  -- local file to send
    destfile -- name to store the file under on the server

    Returns True when the file was sent.  Returns False, without uploading,
    when the server's working directory after the cwd differs from path.
    Errors raised by ftplib or by opening srcfile propagate to the caller.
    """
    ftp = ftplib.FTP( site )  # connect to host
    # ftp.set_debuglevel(1)   # enable when testing
    try:
        ftp.login( user, pwd )
        ftp.cwd( path )
        uploaded = ( path == ftp.pwd() )
        if uploaded:
            # Binary mode: storlines reads raw lines and supplies the CRLF
            # itself; Python 3's ftplib requires a binary file object.
            fileA = open( srcfile, "rb" )
            try:
                ftp.storlines( "STOR " + destfile, fileA )
            finally:
                fileA.close()
        # Send QUIT so the server ends the session cleanly (the attribute
        # used to be referenced without being called, which did nothing).
        ftp.quit()
    finally:
        # Drops the socket if anything above raised; harmless after quit().
        ftp.close()
    return uploaded
#
# Expected params:
#
# 1 - command (gen, put or both)
# 2 - file name
# 3 - ftp user
# 4 - ftp password
# 5 - ftp directory
#
#
# Generate RSS file
#
# Command-line driver: sys.argv[1] selects the action ("gen", "put" or "both")
# and sys.argv[2] names the feed file that is written and/or uploaded.
# NOTE(review): len(sys.argv) is never checked, so a missing argument raises
# IndexError.
# NOTE(review): this section has lost its indentation, and several string
# literals below look as if markup was stripped from them; restore them from
# the original script before running.
if (sys.argv[1] == "gen") or (sys.argv[1] == "both"):
opFile = open( sys.argv[2], "w" )
# Pattern applied to the downloaded page to pick out each weblog entry.
# NOTE(review): the literal is split across two lines and contains only one
# group, yet the loop below unpacks two values (description, link) per match;
# part of the pattern appears to be missing.
# NOTE(review): the class [ A-z] also matches the punctuation that sits
# between "Z" and "a" in ASCII; [ A-Za-z] was presumably intended.
_pattern = re.compile(r'
(.*?).posted by [ A-z]* at ..:.. UTC', re.DOTALL)
# Download the whole weblog page into memory.
data = urllib.urlopen('http://www.cix.co.uk/~jimh/weblog/blogger.html').read()
# Feed header written once before the entries.
# NOTE(review): only the text values remain here; presumably each was wrapped
# in a channel-level element -- confirm against the original.
opFile.write( """
Feet Up!
http://www.cix.co.uk/~jimh/weblog/blogger.html
Feet Up!
en-GB\n""" )
# A single parser instance is reset and reused for every entry.
_parser = MyHTMLParser()
for description, link in _pattern.findall(data):
# NOTE(review): "- " looks like residue of a stripped opening element.
opFile.write( "- \n" )
_parser.reset()
_parser.feed(description)
# One line each for the entry's title, content and link.
opFile.write( "%s\n" % _parser.get_title() )
opFile.write( "%s\n" % _parser.get_content() )
opFile.write( "%s\n" % link )
# NOTE(review): the next literal is broken across two lines, which is not
# valid inside a single-quoted string; presumably a closing element was
# stripped from it.
opFile.write( "
\n" )
# NOTE(review): presumably the feed's closing elements were written here.
opFile.write( "\n\n" )
opFile.close()
#
# FTP RSS file
#
if (sys.argv[1] == "put") or (sys.argv[1] == "both"):
# Host is fixed; user, password and remote directory come from argv[3..5].
# argv[2] is used as both the local and the remote file name.
ftpPut( "www.cix.co.uk", sys.argv[3], sys.argv[4], sys.argv[5], sys.argv[2], sys.argv[2] )