#!/usr/bin/env python
# -*- coding: utf-8 -*-

import HTMLParser
import sys, re, urllib

def simplify(text, remove=None):
    """Strip Wikipedia page chrome from an HTML document.

    The markup is run through WikipediaParser, which drops the elements it
    is configured to skip, and the remaining markup is handed to
    postprocess().

    text   -- HTML source of the page.
    remove -- optional iterable of (section name, heading level) tuples
              naming additional sections to remove.  It is never modified.

    Returns the simplified HTML as a string.
    """
    # Hand a private copy downstream: the list is extended with the default
    # sections there, which must neither leak back into the caller's list
    # nor accumulate in a shared default argument across calls.
    sections = list(remove or [])
    parser = WikipediaParser()
    parser.feed(text)
    return postprocess(parser.text, sections)

def postprocess(text, remove=None):
    """Clean up markup left over after parsing and drop unwanted sections.

    text   -- HTML produced by the parser.
    remove -- optional iterable of (section name, heading level) tuples to
              remove in addition to the built-in list below.  It is never
              modified.

    Returns the cleaned HTML as a string.
    """
    # Work on a private list so that neither the caller's list nor a shared
    # default argument grows with every call.
    sections = list(remove or [])
    sections.extend([('Fußnoten', 2), ('Literatur', 2), ('Quellen', 2),
                     ('Siehe auch', 2), ('Weblinks', 2)])

    # Drop the "main article" box and "siehe (auch): ..." paragraphs.
    text = re.sub('<div><a href="/wiki/Bild:Hauptartikel.svg" .*?</div>', '',
                  text)
    text = re.sub('<p>(<i>)?[Ss]iehe( auch)?:.*?</p>', '', text)
    # Replace links and image-map areas by their contents.
    text = re.sub('<a href=[^>]*>(.*?)</a>', r'\1', text)
    text = re.sub('<area href=[^>]*>(.*?)</area>', r'\1', text)

    for name, level in sections:
        text = removeSection(text, name, level)
    return text

def removeSection(text, name, level):
    """Remove the section titled `name` with heading level `level` from text.

    The section starts at the anchor paragraph preceding its <hN> heading
    and ends right before the first of:

      * the next heading of the same or a higher level (a smaller or equal
        N), so that removing a subsection never runs into the following
        parent section,
      * the "Pre-expand include size" comment,
      * the end of the text, if neither of the above follows.

    name is inserted into the search pattern unescaped, so regular
    expression metacharacters in it are interpreted as such.

    Returns the text without the section, or the text unchanged when no
    such heading exists.
    """
    heading = re.search('<p><a name="[^"]*" id="[^"]*"></a></p>\n<h%d> ?'
                        '<span class="mw-headline">%s</span></h%d>' %
                        (level, name, level), text)
    if not heading:
        return text

    boundary = re.compile('<p><a name="[^"]*" id="[^"]*"></a></p>\n'
                          '<h(\\d+)> ?<span class="mw-headline">[^<]*</span>'
                          '</h\\1>|'
                          '\n<!-- \nPre-expand include size')
    # Without any boundary the section extends to the end of the text.
    cut = len(text)
    for candidate in boundary.finditer(text, heading.end()):
        found = candidate.group(1)
        # group(1) is None for the trailing comment; deeper headings
        # (larger N) belong to the section being removed and are skipped.
        if found is None or int(found) <= level:
            cut = candidate.start()
            break
    return text[:heading.start()] + text[cut:]

class WikipediaParser(HTMLParser.HTMLParser):
    """Re-emit the parsed HTML while dropping unwanted elements.

    Everything the parser sees is appended to self.text, except for
    <script> elements and elements whose id (or, when there is no id
    attribute, class) starts with one of the prefixes in self.skipList or
    whose style starts with "float:right".  Such elements are dropped
    together with all of their content.

    Skipping is tracked by nesting depth, so elements other than the void
    elements listed in voidTags are expected to be closed explicitly.
    """

    # Elements that never have content or an end tag.  They must not change
    # the nesting depth even when written without a trailing slash.
    voidTags = frozenset(['area', 'base', 'br', 'col', 'embed', 'hr', 'img',
                          'input', 'link', 'meta', 'param', 'source', 'track',
                          'wbr'])

    def reset(self):
        """Reset the parser state and the collected output."""
        HTMLParser.HTMLParser.reset(self)
        self.text = ''
        # Number of currently open elements.
        self.depth = 0
        # Depth at which the element being skipped was opened; -1 when
        # nothing is being skipped.
        self.untilDepth = -1
        self.skipList = ['siteSub', 'contentSub', 'toc', 'printfooter',
                         'column-one', 'footer', 'editsection', 'jump-to-nav',
                         'thumb tleft', 'thumb tright', 'floatleft',
                         'floatright', 'metadata', 'catlinks', 'Vorlage_',
                         'NavFrame', 'NavEnd', 'coordinates', 'artikelstadium',
                         '_ref']

    def _isEmitting(self):
        """Return True unless the parser is inside a skipped element."""
        return self.untilDepth == -1 or self.depth < self.untilDepth

    def _startsSkip(self, tag, attrs):
        """Return True if this start tag opens an element to be dropped."""
        if tag == 'script':
            return True
        d = dict(attrs)
        # Attributes without a value (<div id>) are reported as None.
        marker = d.get("id", d.get("class", "")) or ""
        style = d.get("style") or ""
        return (style.startswith("float:right") or
                marker.startswith(tuple(self.skipList)))

    def handle_all(self, text):
        """Append text to the output unless inside a skipped element."""
        if self._isEmitting():
            self.text += text

    def handle_starttag(self, tag, attrs):
        if tag in self.voidTags:
            # No end tag will follow; treat it like <tag />.
            self.handle_startendtag(tag, attrs)
            return
        if self._isEmitting() and self._startsSkip(tag, attrs):
            self.untilDepth = self.depth
        # When a skip has just started, depth == untilDepth and the start
        # tag itself is suppressed as well.
        self.handle_all(self.get_starttag_text())
        self.depth += 1

    def handle_startendtag(self, tag, attrs):
        self.handle_all(self.get_starttag_text())

    def handle_data(self, data):
        self.handle_all(data)

    def handle_endtag(self, tag):
        if tag in self.voidTags:
            # The start tag did not increase the depth either.
            self.handle_all('</%s>' % tag)
            return
        self.depth -= 1
        self.handle_all('</%s>' % tag)
        # The skipped element has just been closed: resume output.
        if self.depth <= self.untilDepth:
            self.untilDepth = -1

    def handle_charref(self, name):
        self.handle_all('&#%s;' % name)

    def handle_entityref(self, name):
        self.handle_all('&%s;' % name)

    def handle_comment(self, data):
        self.handle_all('<!--%s-->' % data)

    def handle_decl(self, decl):
        self.handle_all('<!%s>' % decl)

    def handle_pi(self, data):
        self.handle_all('<?%s>' % data)

def usage():
    """Print the command-line synopsis to stderr and exit with status 1."""
    synopsis = 'usage %s <name> (<skip-section> <skip-level>)*' % sys.argv[0]
    print >> sys.stderr, synopsis
    raise SystemExit(1)
    
if __name__ == '__main__':
    # Import Psyco if available
    try:
        import psyco
        psyco.full()
    except ImportError:
        pass

    if len(sys.argv) < 2:
        usage()
    name = sys.argv[1]
    skip = sys.argv[2:]
    skipList = []
    while skip:
        try:
            skipList.append((skip.pop(0), int(skip.pop(0))))
        except (ValueError, IndexError):
            usage()

    class AppURLopener(urllib.FancyURLopener):
        version = "WikipediaSimplifier/0.1"

    urllib._urlopener = AppURLopener()
    f = urllib.urlopen('http://de.wikipedia.org/wiki/'+name)
    print simplify(f.read(), skipList)
    f.close()

