#!/usr/bin/python
#Author: Timur Pocheptsov, 17/03/2014.
#A script to generate rootcoreteam.h from git stats.
"""Generate rootcoreteam.h from the ROOT git statistics authors page.

The script downloads http://root.cern.ch/gitstats/authors.html, extracts the
'Author' column of the first HTML table found on the page, sorts the names by
family name and writes them as a 0-terminated array of C string literals into
rootcoreteam.h in the current working directory.

The code runs unchanged on Python 2.7 and on Python 3.
"""

import sys
from contextlib import closing
from functools import cmp_to_key

try:  # Python 3
    from urllib.request import urlopen
except ImportError:  # Python 2
    from urllib import urlopen

try:  # Python 3
    from html.parser import HTMLParser
except ImportError:  # Python 2
    from HTMLParser import HTMLParser

# Text emitted before the list of names.
# NOTE(review): the line breaks inside header1/header2 were reconstructed (the
# '#' directives and '//' comments require one per line); the exact number of
# blank lines should be confirmed against a previously generated header.
header1 = r"""#ifndef ROOT_ROOTCoreTeam
#define ROOT_ROOTCoreTeam

namespace ROOT {
namespace ROOTX {

//This file is automatically generated with names from http://root.cern.ch/gitstats/authors.html.
//The names are sorted in an alphabetical order (sorted by a family name).
//Please note the structure: it should be always like this - names as
//string literals in an array's initializer
//with a terminating 0 - that's what our rootxx.cxx and rootx-cocoa.mm expect.
//The array (or its elements actually) has an internal linkage
//(it has a definition here, not in rootxx.cxx or rootx-cocoa.mm files.
//Please, do not modify this file.

const char * gROOTCoreTeam[] = {
"""

# Text emitted after the list of names: the terminating 0 and closing braces.
# NOTE(review): the indentation before '0};' was reconstructed; a tab is used
# to line up with the generated name entries.
header2 = "\n\t0};\n}\n}\n\n#endif\n"


class ParserState:
    """States of GitStatsParser.

    The numeric order matters: 'waiting' must be the smallest value and
    'done'/'errorFound' the largest ones, because the parser uses range
    comparisons to tell "inside the table" from "idle or finished".
    """
    waiting = 0            # initial idle state, no <table> seen yet.
    started = 1            # <table> found, waiting for a header row.
    parsingHeaderRow = 2   # inside the header <tr>.
    parsingHeaderCell = 3  # inside a <th> of the header row.
    parsingRow = 4         # header parsed; reading the table rows.
    parsingCell = 5        # inside a <td> of a table row.
    done = 6               # stopped after the first table.
    errorFound = 7         # the markup did not have the expected structure.


class GitStatsParser(HTMLParser):
    """Collect the 'Author' column of the first table in an HTML document.

    After feed(), 'table' holds the non-empty, stripped author names in
    document order and 'state' holds the final ParserState value
    (ParserState.errorFound if the table was malformed).
    """

    def __init__(self):
        # Explicit base-class call instead of super(): the Python 2 HTMLParser
        # is an old-style class.
        HTMLParser.__init__(self)
        self.table = []
        self.state = ParserState.waiting
        # Per-table bookkeeping, created up front so that every attribute
        # exists whatever order the tags arrive in.
        self.cellData = ''      # text of the cell being read.
        self.nFields = 0        # number of columns in the header row.
        self.authorIndex = -1   # column index of 'Author', -1 if not seen.
        self.currentColumn = 0  # index of the next cell in the current row.

    def isParsing(self):
        """Return True while the parser is inside the table of interest."""
        return ParserState.waiting < self.state < ParserState.done

    def isParsingCell(self):
        """Return True while the parser is inside a <th> or a <td>."""
        return self.state in (ParserState.parsingCell,
                              ParserState.parsingHeaderCell)

    def handle_starttag(self, tag, attrs):
        """Advance the state machine on an opening tag.

        Only the first <table> and the <tr>, <th>, <td> tags inside it are
        of interest; they must appear in the correct order and place.
        """
        if self.state >= ParserState.done:
            # Done or error: nothing else is parsed.
            return

        if tag == 'table':
            if self.state == ParserState.waiting:
                self.state = ParserState.started
            else:
                # A <table> in any other state (e.g. nested) is an error.
                self.state = ParserState.errorFound
            return

        if not self.isParsing():
            # Ignore tags outside of our table.
            return

        if tag == 'tr':
            if self.state == ParserState.parsingRow:
                # A data row: possible only after a valid header row.
                self.currentColumn = 0
            elif self.state == ParserState.started:
                # The header row: <tr><th>...</th></tr>.
                self.state = ParserState.parsingHeaderRow
                self.nFields = 0
                self.authorIndex = -1
            else:
                # <tr> in every other state is an error.
                self.state = ParserState.errorFound
        elif tag == 'td':
            if self.state == ParserState.parsingRow:
                # A valid <td> can happen only inside a table's row.
                self.state = ParserState.parsingCell
                self.cellData = ''
            else:
                self.state = ParserState.errorFound
        elif tag == 'th':
            if self.state == ParserState.parsingHeaderRow:
                self.state = ParserState.parsingHeaderCell
                self.cellData = ''
            else:
                self.state = ParserState.errorFound

    def handle_endtag(self, tag):
        """Advance the state machine on a closing tag.

        Only </table>, </th>, </tr> and </td> of the first table matter.
        """
        if not self.isParsing():
            # Waiting, done or error.
            return

        if tag == 'table':
            if self.state == ParserState.parsingRow:
                self.state = ParserState.done
            else:
                # Unexpected </table>.
                self.state = ParserState.errorFound
        elif tag == 'th':
            if self.state == ParserState.parsingHeaderCell:
                self.state = ParserState.parsingHeaderRow
                if self.cellData.strip().lower() == 'author':
                    if self.authorIndex == -1:
                        self.authorIndex = self.nFields
                    else:
                        # 'Author' more than once.
                        self.state = ParserState.errorFound
                self.nFields += 1
            else:
                # </th> outside of a header cell is an error.
                self.state = ParserState.errorFound
        elif tag == 'tr':
            if self.state == ParserState.parsingRow:
                # A data row must have as many cells as the header.
                if self.nFields != self.currentColumn:
                    self.state = ParserState.errorFound
            elif self.state == ParserState.parsingHeaderRow:
                # The header must be non-empty and contain 'Author'.
                if self.authorIndex == -1 or not self.nFields:
                    self.state = ParserState.errorFound
                else:
                    self.state = ParserState.parsingRow
                    self.currentColumn = 0
            else:
                # </tr> must close a <tr> only.
                self.state = ParserState.errorFound
        elif tag == 'td':
            if self.state == ParserState.parsingCell:
                if self.currentColumn == self.authorIndex:
                    # We got a name.
                    name = self.cellData.strip()
                    if name:
                        self.table.append(name)
                self.state = ParserState.parsingRow
                self.currentColumn += 1
            else:
                # </td> must follow a <td>.
                self.state = ParserState.errorFound

    def handle_data(self, data):
        """Accumulate the text found inside the current <th>/<td> cell."""
        if self.isParsingCell():
            self.cellData += data


def _cmp(a, b):
    """Three-way comparison: negative, zero or positive (Python 2 'cmp')."""
    return (a > b) - (a < b)


#_____________________________________________________________________
def sort_predicate(x, y):
    """Compare two names by family name (old-style cmp function).

    When both names consist of exactly two space-separated words the second
    words are compared; otherwise the complete strings are compared.
    Returns a negative number, zero or a positive number.
    """
    name1 = x.split(' ')
    name2 = y.split(' ')
    if len(name1) == 2 and len(name2) == 2:
        return _cmp(name1[1], name2[1])
    return _cmp(x, y)


def _c_string_literal(text):
    """Return 'text' as a double-quoted C string literal.

    Backslashes and double quotes are escaped: the names come from a web
    page and must not be able to break out of the literal.
    """
    return '"' + text.replace('\\', '\\\\').replace('"', '\\"') + '"'


#_____________________________________________________________________
def generate_rootheader(names):
    """Write rootcoreteam.h into the current working directory.

    'names' is a sequence of strings; each becomes one array entry. Nothing
    is written (and no file is created) when 'names' is empty.
    """
    if not names:
        return

    # One tab-indented literal per line; the comma after the last name
    # precedes the terminating 0 supplied by header2.
    coreTeam = ',\n'.join('\t' + _c_string_literal(name) for name in names)
    coreTeam += ','

    with open("rootcoreteam.h", "w") as output:
        output.write(header1)
        output.write(coreTeam)
        output.write(header2)


def _fix_names(names):
    """Apply the hand-maintained corrections to the parsed names, in place."""
    if 'CristinaCristescu' in names:
        names.remove('CristinaCristescu')
        names.append('Cristina Cristescu')
    if 'Stefan Roiser' in names:
        names.remove('Stefan Roiser')
    # NOTE(review): appended unconditionally - the nesting of this statement
    # was reconstructed; confirm it does not belong to the 'if' above.
    names.append('Valeri Onuchine')


#_____________________________________________________________________
def main():
    """Download the authors page and regenerate rootcoreteam.h.

    The script stays best effort: a failure never propagates as an
    exception, but it is reported on stderr instead of being silently
    ignored.
    """
    url = "http://root.cern.ch/gitstats/authors.html"
    try:
        with closing(urlopen(url)) as response:
            html = response.read()
        if not html:
            return

        if not isinstance(html, str):
            # Python 3: the response is bytes while HTMLParser needs text.
            # NOTE(review): UTF-8 is assumed for the page - confirm.
            html = html.decode('utf-8', 'replace')

        parser = GitStatsParser()
        parser.feed(html)
        if parser.state == ParserState.errorFound or not parser.table:
            sys.stderr.write("rootcoreteam.h was not generated: "
                             "no valid authors table found at %s\n" % url)
            return

        names = parser.table
        _fix_names(names)
        # cmp_to_key keeps sort_predicate usable where list.sort() no longer
        # accepts a comparison function.
        names.sort(key=cmp_to_key(sort_predicate))
        generate_rootheader(names)
    except Exception as err:
        sys.stderr.write("rootcoreteam.h was not generated: %s\n" % (err,))


if __name__ == '__main__':
    main()