ktouch_tfa_wikipedia-2.1

That script I wrote the other day was quick, and it worked, but it was kind of kludgy. Of course, I didn’t spend much time on it. But I got to thinking: wouldn’t it be nice if it didn’t overwrite the old lesson every time it was run? And then I found that there was a pesky long dash that Wikipedia likes to use that just isn’t on the keyboard. A rewrite was in order, and I had it done, and then moments ago I thought that something should prevent duplicate lessons from being created as well. I guess this project kind of ran away from me a little bit, but at last here it is: ktouch_tfa_wikipedia-2.1

#! /usr/bin/python
# -*- coding: UTF-8 -*-
# ktouch_tfa_wikipedia-2.1.py by bnr 20091105
# ktouch touch typing lessons scraped from
# today's featured article from wikipedia

import re
import os
import codecs
from xml.dom import minidom
from urllib import FancyURLopener

def main() :
        """Scrape today's featured article and store it as a KTouch lesson."""
        # getTFA fills this list in place; makeXmlFile then writes it out
        lessonLines = []
        getTFA("http://en.wikipedia.org/wiki/Main_Page", lessonLines)
        makeXmlFile('/usr/share/kde4/apps/ktouch/wikipedia.ktouch.xml', lessonLines)

# cut the featured article out of the main page and make it typable
def _extractExcerpt(html) :
        """Return the featured-article excerpt of html as plain text.

        html -- decoded markup of the Wikipedia main page

        Raises IndexError if the featured-article block is not present.
        """
        # parse out Todays Featured Article
        text = html.split('<div id="mp-tfa" style="padding:2px 5px">\n')[1]
        text = text.split('Recently featured:')[0]

        text = text.replace('\n', '')           # remove newline characters and
        text = text.replace('&#160;', ' ')      # the non-breaking space entity
        text = re.sub(r'<.*?>', '', text)       # strip out the html tags
        text = text.replace(u"\u2013", '-')     # replace long dash
        text = text.strip()

        # drop the trailing '(more...)' link text, but only when it is
        # really there, so a differently shaped excerpt is not truncated
        if text.endswith('(more...)') :
                text = text[:-len('(more...)')].rstrip()
        return text

# break a string into a list of strings ~width characters long
def _splitIntoLines(text, width=70) :
        """Split text into lines of at least width characters, each cut at
        the next space so that words stay whole.

        The last line holds whatever remains; surrounding whitespace is
        trimmed and empty lines are never produced.
        """
        lines = []
        start = 0
        end = len(text)
        while start < end :
                # first space at or after the minimum width; -1 means the
                # rest of the text has no further break point
                cut = text.find(' ', start + width)
                if cut == -1 :
                        cut = end
                line = text[start:cut].strip()
                if line :
                        lines.append(line)
                start = cut
        return lines

# get today's featured article
def getTFA(url, lessons) :
        """Download url and append today's featured article to lessons.

        url     -- address of the page to fetch (the Wikipedia main page)
        lessons -- list that is extended in place with the excerpt, cut
                   into lines of roughly 70 characters

        Raises IndexError if the page holds no featured-article block.
        """

        # a custom urlopener with firefox2 user agent
        class MyOpener(FancyURLopener) :
                version = 'Mozilla/5.0 (\
                Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11'

        # get the wikipedia mainpage; decode once at the I/O boundary and
        # release the connection even if reading fails
        page = MyOpener().open(url)
        try :
                html = page.read().decode('utf-8')
        finally :
                page.close()

        lessons.extend(_splitIntoLines(_extractExcerpt(html)))

# pretty-printed xml gains padding text nodes on every rewrite, so
# trim the text and drop the empty nodes before editing a parsed file
def removeWhitespaceNodes(node) :
        """Recursively tidy the text nodes below node, in place.

        Each text node has its surrounding whitespace and every newline
        removed; text nodes left empty are detached from the tree and
        unlinked.
        """
        emptyTextNodes = []
        for kid in node.childNodes :
                if kid.nodeType != minidom.Node.TEXT_NODE :
                        # only descend into nodes that have children
                        if kid.hasChildNodes() :
                                removeWhitespaceNodes(kid)
                        continue
                cleaned = kid.data.strip().replace('\n', '')
                kid.data = cleaned
                if not cleaned :
                        emptyTextNodes.append(kid)
        # remove after the walk so the child list is not changed mid-loop
        for dead in emptyTextNodes :
                dead.parentNode.removeChild(dead)
                dead.unlink()

# build the skeleton of a brand new lecture file
def _newLectureDocument() :
        """Return (doc, levelsTag) for an empty KTouch lecture document."""
        doc = minidom.Document()

        # base Node
        ktl = doc.createElement("KTouchLecture")
        doc.appendChild(ktl)

        # first three children of base Node: simple text-only tags
        for tagName, text in (
                        ("Title", "Wikipedia Daily Featured Article (auto-generated)"),
                        ("Comment", "KTouch training file generated by python"),
                        ("FontSuggestions", "Monospace")) :
                tag = doc.createElement(tagName)
                tag.appendChild(doc.createTextNode(text))
                ktl.appendChild(tag)

        # fourth child of base Node; parent of all Level Nodes
        levelsTag = doc.createElement("Levels")
        ktl.appendChild(levelsTag)
        return doc, levelsTag

# find the text that identifies the most recent lesson
def _firstLineOfLastLevel(levelsTag) :
        """Return the text of the first <Line> in the last <Level> below
        levelsTag, or '' when there is no such line."""
        levels = [n for n in levelsTag.childNodes
                  if n.nodeType == minidom.Node.ELEMENT_NODE and n.tagName == "Level"]
        if not levels :
                return ''
        lineTags = levels[-1].getElementsByTagName("Line")
        if not lineTags or lineTags[0].firstChild is None :
                return ''
        # read .data rather than toxml(): toxml() returns the XML-escaped
        # form, which never equals the raw text when it contains & < > "
        return lineTags[0].firstChild.data

# create or read and rewrite xml file
def makeXmlFile(xmlFile, lessons) :
        """Append lessons as a new <Level> to the KTouch lecture in xmlFile.

        xmlFile -- path of the lecture file; created if it does not exist
        lessons -- list of lesson lines; the first one is also used (cut
                   to 36 characters) as the level's NewCharacters label

        Nothing is read or written when lessons is empty. No level is
        added when the first line equals the first line of the last level
        already stored, so running twice does not duplicate a lesson.
        """
        if not lessons :
                return

        if os.path.exists(xmlFile) :
                with open(xmlFile, mode='rb') as xmlObj :
                        doc = minidom.parse(xmlObj)
                # if editing existing xml remove whitespace (or else it grows)
                removeWhitespaceNodes(doc)

                # the parent of the new level; recreate it if it is missing
                found = doc.getElementsByTagName("Levels")
                if found :
                        levelsTag = found[0]
                else :
                        levelsTag = doc.createElement("Levels")
                        doc.documentElement.appendChild(levelsTag)
                lastLesson = _firstLineOfLastLevel(levelsTag)
        else :  # if xml file does not exist create it
                doc, levelsTag = _newLectureDocument()
                lastLesson = ''

        # prevent the creation of duplicate lessons; stored text has been
        # whitespace-trimmed, so trim both sides before comparing
        if lastLesson.strip() != lessons[0].strip() :
                levelTag = doc.createElement("Level")
                levelsTag.appendChild(levelTag)

                newChars = doc.createElement("NewCharacters")
                levelTag.appendChild(newChars)
                newChars.appendChild(doc.createTextNode(lessons[0][:36]))

                # our 'lesson lines'
                for line in lessons :
                        lineTag = doc.createElement("Line")
                        levelTag.appendChild(lineTag)
                        lineTag.appendChild(doc.createTextNode(line))

        # output pretty XML and print to file
        output = doc.toprettyxml(indent="  ")
        with codecs.open(xmlFile, mode='w', encoding='utf-8') as outFile :
                outFile.write(output)

# run the scrape-and-write pipeline; NOTE: there is no __name__ guard,
# so this call also executes if the file is imported as a module
main()
Advertisements

3 thoughts on “ktouch_tfa_wikipedia-2.1”

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. ( Log Out / Change )

Twitter picture

You are commenting using your Twitter account. ( Log Out / Change )

Facebook photo

You are commenting using your Facebook account. ( Log Out / Change )

Google+ photo

You are commenting using your Google+ account. ( Log Out / Change )

Connecting to %s