#!/usr/bin/python import urllib, re, cgi, string, ftplib, sys from HTMLParser import HTMLParser # Simple Python app to generate an RDF Site Summary for http://www.cix.co.uk/~jimh/weblog/blogger.html # can also ftp the file generated (if required see params comment later). # # Based on a script by Mark Pilgrim: http://diveintomark.org/archives/2002/08/06.html#feeding_zeldman # # Jim Hughes - Email: jim@fineway.cx - Jabber: jimh@amessage.de or jimh@jabber.at class MyHTMLParser(HTMLParser): """Trivial HTML parser class, basically strips out all tags and leaves just textual content, the first bold section gets put into self._title and everything else goes into self._content these can be retrieved with the get_title & get_content() methods""" def reset(self): HTMLParser.reset(self) self._title = "" self._content = "" self._hadtitle = 0 def handle_starttag(self, tag, attrs): if tag == "br": self._content += "\r\n" elif tag == "p": self._content += "\r\n" elif tag == "a": self._content += "<a" for k, v in attrs: self._content += ' ' + k + '="' + cgi.escape(v) + '"' self._content += ">" def handle_endtag(self, tag): if self._hadtitle == 0: if tag == "b": self._hadtitle = 1 self._title = self._content self._content = "" if tag =="a": self._content += "</" + tag + ">" def handle_data(self, data): data = cgi.escape(data) data = data.replace("'", "'") data = data.replace('"', """) self._content += data def handle_entityref(self, name): if name == "amp": self._content += "and" #else: # self._content += "&" + name + ";" def get_title(self): return string.strip(self._title) def get_content(self): return string.strip(self._content) # Trivial function to put an Ascii file onto an ftp site def ftpPut( site, user, pwd, path, srcfile, destfile ): ftp = ftplib.FTP( site ) # connect to host # ftp.set_debuglevel(1) # comment out this line if you're not testing ftp.login( user, pwd ) ftp.cwd( path ) if path == ftp.pwd(): fileA = open( srcfile, "r" ) ftp.storlines("STOR " + destfile, fileA) fileA.close() ftp.quit # # Expected params: # # 1 - command (gen, put or both) # 2 - file name # 3 - ftp user # 4 - ftp password # 5 - ftp directory # # # Generate RSS file # if (sys.argv[1] == "gen") or (sys.argv[1] == "both"): opFile = open( sys.argv[2], "w" ) _pattern = re.compile(r' 
(.*?).posted by [ A-z]* at ..:.. UTC', re.DOTALL) data = urllib.urlopen('http://www.cix.co.uk/~jimh/weblog/blogger.html').read() opFile.write( """ Feet Up! http://www.cix.co.uk/~jimh/weblog/blogger.html Feet Up! en-GB\n""" ) _parser = MyHTMLParser() for description, link in _pattern.findall(data): opFile.write( "\n" ) _parser.reset() _parser.feed(description) opFile.write( "%s\n" % _parser.get_title() ) opFile.write( "%s\n" % _parser.get_content() ) opFile.write( "%s\n" % link ) opFile.write( "\n" ) opFile.write( "\n\n" ) opFile.close() # # FTP RSS file # if (sys.argv[1] == "put") or (sys.argv[1] == "both"): ftpPut( "www.cix.co.uk", sys.argv[3], sys.argv[4], sys.argv[5], sys.argv[2], sys.argv[2] )