Lehelt lugemise koodi näide
Allikas: Lambda
#!/usr/local/bin/python
# Collects triples from a start page and the pages it links to.
#
# An XSLT stylesheet (extractorPath) is applied to the HTML start page; its
# output is parsed into a list of triple dicts.  For every subject found on
# the start page, the objects of its 'er:infoleht' triples are treated as
# further page locations and processed the same way (including any
# 'er:infoleht' values found on those pages).  A few derived triples are
# added per page and the complete set is written to outfile as XML.
#
# NOTE(review): this listing was recovered from a copy whose line breaks and
# indentation had been lost; block nesting was reconstructed from the
# statement order and data flow -- confirm against a pristine copy.

import os
import sys
from types import *

import libxml2
import libxslt

# these paths must be set

startPage="/home/tanel/Ms/Semriik/index.html"
extractorPath="/home/tanel/Ms/Semriik/extractrdfa.xsl"
outfile="/home/tanel/Ms/Semriik/data.xml"

# debug will give extra printout

debugFlag=0


def main():
    """Extract triples from the start page and linked pages; write them to outfile."""
    # -- read and parse extractor stylesheet --
    styleDoc = libxml2.parseFile(extractorPath)
    style = libxslt.parseStylesheetDoc(styleDoc)
    try:
        # -- read start page triples --
        confdata = _extractTriples(style, startPage)
        fulldata = list(confdata)
        if debugFlag:
            print("==== start page gave triplets ====\n")
            showTriplets(confdata)

        # -- loop over configuration data --
        handledorg = set()
        handledurl = set()
        # The first start-page triple is deliberately skipped, as in the
        # original code.  NOTE(review): the reason is not visible here.
        for triple in confdata[1:]:
            organisation = triple.get('subject')
            if organisation in handledorg:
                continue
            if debugFlag:
                print("\n==== handling org  %s  ====\n" % (organisation,))
            handledorg.add(organisation)
            pending = tripleSubjectValueList(confdata, organisation, 'er:infoleht')
            if debugFlag:
                print("found urls for org:  %s" % (pending,))
            # 'pending' is a work queue: pages may contribute further urls.
            while pending:
                url = pending.pop(0)
                if url in handledurl:
                    continue
                if debugFlag:
                    print("==== starting to process url ====")
                    print(url)
                handledurl.add(url)
                filedata = _extractTriples(style, url)
                if debugFlag:
                    print("--- url gave initial triplets ---")
                    showTriplets(filedata)
                internalurllist = tripleValueList(filedata, 'er:infoleht')
                if debugFlag:
                    print("internal infoleht list:  %s" % (internalurllist,))
                filedata = filedata + _deriveTriples(filedata, url, organisation)
                if debugFlag:
                    print("--- final triplets for url ---")
                    showTriplets(filedata)
                fulldata.extend(filedata)
                pending.extend(internalurllist)
    finally:
        # Release the stylesheet even if a page fails to parse.
        style.freeStylesheet()

    if debugFlag:
        print("\n==== final full data ====\n")
        showTriplets(fulldata)
    xmlstr = tripletsToXml(fulldata)
    if debugFlag:
        print("\n==== final full data as xml string ====\n")
        print(xmlstr)
    # 'with' guarantees the output is flushed and closed.
    with open(outfile, "w") as handler:
        handler.write(xmlstr)


def _extractTriples(style, url):
    """Apply the extractor stylesheet to the HTML page at url; return a list of triple dicts."""
    filedoc = libxml2.htmlParseFile(url, None)
    try:
        # XSLT parameters are XPath expressions, so the string value has to
        # be passed wrapped in quotes.
        appres = style.applyStylesheet(filedoc, {'filepath': "'" + url + "'"})
        try:
            triples = style.saveResultToString(appres)
        finally:
            if appres is not None:
                appres.freeDoc()
    finally:
        filedoc.freeDoc()
    # parseTriplesXml returns None for empty results; callers need a list.
    return parseTriplesXml(triples) or []


def _deriveTriples(filedata, url, organisation):
    """Build the automatically added triples (organisation, department, type) for one page."""
    subjects = tripleGetSubjects(filedata)
    if debugFlag:
        print("internal subjects list:  %s" % (subjects,))
    tmporg = tripleMakeForSubjects(subjects, 'er:asutus', organisation)
    department = tripleSubjectFirstValue(filedata, url, 'er:osakond')
    if department:
        tmpdepartment = tripleMakeForSubjects(subjects, 'er:osakond', department)
    else:
        tmpdepartment = []
    tmptype = tripleMakePropForSubjectsDomain(
        filedata, subjects, 'er:tyyp', 'tootaja', ('er:amet', 'er:ametijuhend',))
    if debugFlag:
        print("--- derived triplets for url ---")
        showTriplets(tmporg)
        showTriplets(tmpdepartment)
        showTriplets(tmptype)
    return tmporg + tmpdepartment + tmptype


def tripleSubjectWithPredValueSet(triples,predicate,object):
    """Return the distinct subjects, in first-seen order, having the given predicate and object."""
    res = []
    for el in triples:
        if (isinstance(el, dict)
                and el.get('object') == object
                and el.get('predicate') == predicate
                and el.get('subject') not in res):
            res.append(el.get('subject'))
    return res


def tripleSubjectHasValue(triples,subject,predicate,object):
    """Return True if the exact (subject, predicate, object) triple is present."""
    for el in triples:
        if (isinstance(el, dict)
                and el.get('subject') == subject
                and el.get('predicate') == predicate
                and el.get('object') == object):
            return True
    return False


def tripleSubjectValueList(triples,subject,predicate):
    """Return all objects (duplicates kept) of triples with the given subject and predicate."""
    return [el.get('object') for el in triples
            if isinstance(el, dict)
            and el.get('subject') == subject
            and el.get('predicate') == predicate]


def tripleSubjectFirstValue(triples,subject,predicate):
    """Return the object of the first matching triple, or "" if there is none."""
    for el in triples:
        if (isinstance(el, dict)
                and el.get('subject') == subject
                and el.get('predicate') == predicate):
            return el.get('object')
    return ""


def tripleValueList(triples,predicate):
    """Return all objects (duplicates kept) of triples with the given predicate."""
    return [el.get('object') for el in triples
            if isinstance(el, dict) and el.get('predicate') == predicate]


def tripleGetSubjects(triples):
    """Return the distinct subjects of the triples in first-seen order."""
    res = []
    for el in triples:
        if isinstance(el, dict):
            tmp = el.get('subject')
            if tmp not in res:
                res.append(tmp)
    return res


def tripleMakeForSubjects(subjects,predicate,object):
    """Return one new triple dict (subject, predicate, object) per given subject."""
    return [{'subject': el, 'predicate': predicate, 'object': object}
            for el in subjects]


# the following code adds derived data for certain kinds of subjects

def tripleMakePropForSubjectsDomain(filedata,subjects,newpred,newvalue,oldpreds):
    """Return a (newpred, newvalue) triple for each triple whose predicate is in oldpreds.

    One result is produced per matching triple, so a subject that matches
    several times yields several identical results.  The subjects argument is
    accepted but not used.  An earlier, disabled variant skipped subjects
    that already carried newpred; that filter is not applied.
    """
    return [{'subject': el.get('subject'), 'predicate': newpred, 'object': newvalue}
            for el in filedata
            if isinstance(el, dict) and el.get('predicate') in oldpreds]


def showTriplets(triplets):
    """Print subject, predicate, object and type of every triple, one per line."""
    for el in triplets:
        if isinstance(el, dict):
            print("%s %s %s %s" % (el.get('subject'), el.get('predicate'),
                                   el.get('object'), el.get('type')))


def tripletsToXml(triplets):
    """Serialize the triples to a <triplets> XML string.

    Field values are XML-escaped; a missing or empty type is written as
    'xsd:string'.  Items that are not dicts are ignored.
    """
    template = """
<triplet>
  <subject>%s</subject><predicate>%s</predicate><object>%s</object><type>%s</type>
</triplet>"""
    parts = ["<triplets>"]
    for el in triplets:
        if isinstance(el, dict):
            typeval = el.get('type') or 'xsd:string'
            parts.append(template % (xmlStrEnc(el.get('subject')),
                                     xmlStrEnc(el.get('predicate')),
                                     xmlStrEnc(el.get('object')),
                                     xmlStrEnc(typeval),))
    parts.append("""
</triplets>""")
    return "".join(parts)


def removeElem(el,list):
    """Return a new list without any occurrence of el; the same list object if el is absent."""
    if el in list:
        return [x for x in list if x != el]
    return list


def parseTriplesXml(xmlstr):
    """Parse the extractor's XML output into a list of dicts.

    Each child element of the root becomes one dict mapping the names of its
    own child elements to their decoded text; child elements without text and
    triples without any field are dropped.  Returns None when xmlstr is empty
    or the root element has no children.
    """
    if not xmlstr:
        return None
    dom = libxml2.parseDoc(xmlstr)
    try:
        root = dom.children
        if not root or not root.children:
            return None
        res = []
        triple = root.children
        while triple:
            if triple.type == 'element':
                resel = {}
                el = triple.children
                while el:
                    if (el.type == 'element' and el.children
                            and el.children.type == 'text'):
                        resel[el.name] = xmlStrDec(el.content)
                    el = el.next
                if resel:
                    res.append(resel)
            triple = triple.next
        return res
    finally:
        # All values were copied into Python strings; free the C document.
        dom.freeDoc()


def xmlStrDec(data):
    """Replace the five predefined XML entities in data; return "" for empty/None."""
    if not data:
        return ""
    data = data.replace('&lt;', '<')
    data = data.replace('&gt;', '>')
    data = data.replace('&apos;', "'")
    data = data.replace('&quot;', '"')
    # '&amp;' goes last so that e.g. '&amp;lt;' decodes to '&lt;', not '<'.
    data = data.replace('&amp;', '&')
    return data


def xmlStrEnc(data):
    """Escape &, < and > in data for use as XML text; return "" for empty/None."""
    if not data:
        return ""
    # '&' goes first so the entities produced below are not escaped again.
    data = data.replace('&', '&amp;')
    data = data.replace('<', '&lt;')
    data = data.replace('>', '&gt;')
    return data


if __name__ == "__main__":
    main()