Lehelt lugemise koodi näide

Allikas: Lambda
#!/usr/local/bin/python

import sys
import os
import libxml2
import libxslt
from types import *

# Input/output locations; these paths must be set for the local installation.
# startPage is the HTML page main() parses first, extractorPath is the XSLT
# stylesheet applied to every page, and outfile receives the final XML.

startPage="/home/tanel/Ms/Semriik/index.html"
extractorPath="/home/tanel/Ms/Semriik/extractrdfa.xsl"
outfile="/home/tanel/Ms/Semriik/data.xml"

# A true value makes main() print the intermediate and final triples
debugFlag=0

    
def main():    
    #print 'content-type: text/html\n'        
    
    fulldata=[]
    
    # -- read and parse extractor stylesheets --

    styleDoc=libxml2.parseFile(extractorPath)
    style=libxslt.parseStylesheetDoc(styleDoc)
    
    # -- read start page triples --        
    
    url=startPage
    filedoc=libxml2.htmlParseFile(url,None)
    appres=style.applyStylesheet(filedoc, {'filepath' : "'"+url+"'"})
    triples=style.saveResultToString(appres)               
    filedata=parseTriplesXml(triples)
    confdata=filedata
    fulldata=fulldata+confdata
    if debugFlag:    
        print "==== start page gave triplets ====\n"
        showTriplets(confdata)     
        
    # -- loop over configuration data

    handledorg=[]
    handledurl=[]
    for triple in confdata[1:]:        
        subject=triple.get('subject')
        organisation=subject
        if not organisation in handledorg:           
            if debugFlag:    
                print "\n==== handling org ",organisation," ====\n"
            handledorg=handledorg+[organisation]
            urllist=tripleSubjectValueList(confdata,subject,'er:infoleht')        
            if debugFlag:    
                print "found urls for org: ",urllist                                
            while urllist:                
                url=urllist[0]                
                if not url in handledurl:
                    if debugFlag:    
                        print "==== starting to process url ===="
                        print url
                    handledurl=handledurl+[url]    
                    filedoc=libxml2.htmlParseFile(url,None)                
                    appres=style.applyStylesheet(filedoc, {'filepath' : "'"+url+"'"})
                    triples=style.saveResultToString(appres)                                     
                    filedata=parseTriplesXml(triples)
                    if debugFlag:    
                        print "--- url gave initial triplets ---"                  
                        showTriplets(filedata)
                    internalurllist=tripleValueList(filedata,'er:infoleht')     
                    if debugFlag:    
                        print 'internal infoleht list: ',internalurllist      

                    # here starts code for automatically adding extra data 
                  
                    subjects=tripleGetSubjects(filedata)
                    if debugFlag:    
                        print 'internal subjects list: ',subjects
                    tmporg=tripleMakeForSubjects(subjects,'er:asutus',organisation)                    
                    department=tripleSubjectFirstValue(filedata,url,'er:osakond')                
                    if department:
                        tmpdepartment=tripleMakeForSubjects(subjects,'er:osakond',department)
                    else:
                        tmpdepartment=[]                      
                    tmptype=tripleMakePropForSubjectsDomain(filedata,subjects,
                                    'er:tyyp','tootaja',('er:amet','er:ametijuhend',))    

                     # here ends code for automatically adding extra data 

                    if debugFlag:    
                        print "--- derived triplets for url ---"                  
                        showTriplets(tmporg)
                        showTriplets(tmpdepartment)
                        showTriplets(tmptype)                        
                    filedata=filedata+tmporg+tmpdepartment+tmptype               
                    if debugFlag:    
                        print "--- final triplets for url ---"                  
                        showTriplets(filedata)                    
                    fulldata=fulldata+filedata
                    urllist=urllist+internalurllist
                urllist=urllist[1:]
            
    style.freeStylesheet()
    if debugFlag:    
        print "\n==== final full data ====\n"       
        showTriplets(fulldata)   
    xmlstr=tripletsToXml(fulldata)
    if debugFlag:    
        print "\n==== final full data as xml string ====\n"       
        print xmlstr
    handler=open(outfile,"w")    
    handler.write(xmlstr)    
  
    

def tripleSubjectWithPredValueSet(triples, predicate, object):
    """Return the distinct subjects that have the given predicate and object.

    triples   -- iterable of triple dicts with the keys 'subject',
                 'predicate' and 'object'; elements that are not dicts
                 are ignored
    predicate -- predicate value to match
    object    -- object value to match

    Returns a list of subjects in order of first occurrence, without
    duplicates.
    """
    res = []
    for el in triples:
        if not isinstance(el, dict):
            continue
        if el.get('object') != object or el.get('predicate') != predicate:
            continue
        subject = el.get('subject')
        if subject not in res:
            res.append(subject)
    return res
    

def tripleSubjectHasValue(triples, subject, predicate, object):
    """Tell whether the exact (subject, predicate, object) triple is present.

    triples -- iterable of triple dicts; elements that are not dicts are
               ignored

    Returns True if some dict matches all three values, otherwise False.
    """
    return any(
        isinstance(el, dict)
        and el.get('subject') == subject
        and el.get('predicate') == predicate
        and el.get('object') == object
        for el in triples
    )


def tripleSubjectValueList(triples, subject, predicate):
    """Return every object stored for the given subject and predicate.

    triples -- iterable of triple dicts; elements that are not dicts are
               ignored

    Returns a new list of the matching 'object' values in input order;
    duplicates are kept.
    """
    return [
        el.get('object')
        for el in triples
        if isinstance(el, dict)
        and el.get('subject') == subject
        and el.get('predicate') == predicate
    ]
    

def tripleSubjectFirstValue(triples, subject, predicate):
    """Return the object of the first triple with this subject and predicate.

    triples -- iterable of triple dicts; elements that are not dicts are
               ignored

    Returns the 'object' value of the first match (None if that dict has
    no 'object' key), or the empty string when nothing matches.
    """
    for el in triples:
        if not isinstance(el, dict):
            continue
        if el.get('subject') == subject and el.get('predicate') == predicate:
            return el.get('object')
    return ""


def tripleValueList(triples, predicate):
    """Return the objects of all triples that carry the given predicate.

    triples -- iterable of triple dicts; elements that are not dicts are
               ignored

    Returns a new list of 'object' values in input order, regardless of
    subject; duplicates are kept.
    """
    return [
        el.get('object')
        for el in triples
        if isinstance(el, dict) and el.get('predicate') == predicate
    ]


def tripleGetSubjects(triples):
    """Return the distinct subjects occurring in the triples.

    triples -- iterable of triple dicts; elements that are not dicts are
               ignored

    Returns a list in order of first occurrence.  A dict without a
    'subject' key contributes None.
    """
    res = []
    for el in triples:
        if not isinstance(el, dict):
            continue
        subject = el.get('subject')
        if subject not in res:
            res.append(subject)
    return res


def tripleMakeForSubjects(subjects, predicate, object):
    """Build one triple per subject, all sharing the same predicate and object.

    Returns a list of new dicts with the keys 'subject', 'predicate' and
    'object', in the order of subjects.
    """
    return [
        {'subject': subject, 'predicate': predicate, 'object': object}
        for subject in subjects
    ]

# the following code adds derived data for certain kinds of subjects

def tripleMakePropForSubjectsDomain(filedata, subjects, newpred, newvalue, oldpreds):
    """Derive a (subject, newpred, newvalue) triple for each matching triple.

    filedata -- iterable of triple dicts; elements that are not dicts are
                ignored
    subjects -- not used by this function; kept for existing callers
    newpred  -- predicate of the generated triples
    newvalue -- object of the generated triples
    oldpreds -- container of predicates that trigger a generated triple

    Returns a list with one new triple for every triple in filedata whose
    predicate is in oldpreds.  A subject that matches several times is
    emitted several times, and subjects that already carry newpred are
    not filtered out.
    """
    return [
        {'subject': el.get('subject'), 'predicate': newpred, 'object': newvalue}
        for el in filedata
        if isinstance(el, dict) and el.get('predicate') in oldpreds
    ]
                

def showTriplets(triplets):  
    for el in triplets:
        if type(el)==DictType:
            print el.get('subject'),el.get('predicate'),el.get('object'),el.get('type')           


def tripletsToXml(triplets):
    """Serialise triples into a <triplets> XML string.

    triplets -- iterable of triple dicts with the keys 'subject',
                'predicate', 'object' and optionally 'type'; elements
                that are not dicts are skipped

    Every value is escaped with xmlStrEnc.  A missing or empty 'type' is
    written as 'xsd:string'.  Returns the complete document as one string.
    """
    # the run of blanks after </type> is part of the existing output format
    template = ("\n<triplet>\n"
                " <subject>%s</subject><predicate>%s</predicate>"
                "<object>%s</object><type>%s</type>            \n"
                "</triplet>")
    # collect the pieces and join once instead of growing a string per triple
    parts = ["<triplets>"]
    for el in triplets:
        if not isinstance(el, dict):
            continue
        typeval = el.get('type') or 'xsd:string'
        parts.append(template % (xmlStrEnc(el.get('subject')),
                                 xmlStrEnc(el.get('predicate')),
                                 xmlStrEnc(el.get('object')),
                                 xmlStrEnc(typeval),))
    parts.append("\n</triplets>")
    return "".join(parts)


def removeElem(el, list):
    """Return list without any element equal to el.

    When el does not occur, the very same list object is handed back;
    otherwise a new list holding the remaining elements is returned.
    """
    if el not in list:
        return list
    return [item for item in list if item != el]


def parseTriplesXml(xmlstr):
    """Parse the XML produced by the extractor into a list of triple dicts.

    xmlstr -- XML text whose root element holds one child element per
              triple; each of those holds child elements whose names
              become the dict keys

    Returns None when xmlstr is empty or the root has no children.
    Otherwise returns a list of dicts mapping element name to the
    xmlStrDec-decoded text content.  Only fields whose first child is a
    text node are stored, and triples without any stored field are dropped.
    """
    if not xmlstr:
        return None
    dom = libxml2.parseDoc(xmlstr)
    try:
        root = dom.children
        if not root or not root.children:
            return None
        res = []
        triple = root.children
        while triple:
            if triple.type == 'element':
                resel = {}
                el = triple.children
                while el:
                    if (el.type == 'element' and el.children
                            and el.children.type == 'text'):
                        resel[el.name] = xmlStrDec(el.content)
                    el = el.next
                if resel:
                    res.append(resel)
            triple = triple.next
        return res
    finally:
        # libxml2 documents are not garbage collected; all needed values
        # have been copied into Python strings by now
        dom.freeDoc()



def xmlStrDec(data):
    """Replace the five predefined XML entities in data by their characters.

    data -- string to decode; None or an empty string gives ""

    Returns the decoded string.
    """
    if not data:
        return ""
    # Each entity is written as two adjacent literals ('&' plus the rest) so
    # that a tool which un-escapes entities in this source cannot turn the
    # replacements into no-ops or broken quotes.
    # The ampersand entity is decoded last; decoding it earlier would let
    # its output combine with following text into a second entity.
    entities = (
        ('&' 'lt;', '<'),
        ('&' 'gt;', '>'),
        ('&' 'apos;', "'"),
        ('&' 'quot;', '"'),
        ('&' 'amp;', '&'),
    )
    for entity, char in entities:
        data = data.replace(entity, char)
    return data
  

def xmlStrEnc(data):
    """Escape the characters &, < and > in data for use as XML text content.

    data -- string to encode; None or an empty string gives ""

    Returns the escaped string.  Quote characters are left unchanged.
    """
    if not data:
        return ""
    # Each entity is written as two adjacent literals ('&' plus the rest) so
    # that a tool which un-escapes entities in this source cannot turn the
    # replacements into no-ops.
    # The ampersand goes first, otherwise the ampersands introduced by the
    # other two replacements would be escaped a second time.
    data = data.replace('&', '&' 'amp;')
    data = data.replace('<', '&' 'lt;')
    data = data.replace('>', '&' 'gt;')
    return data

# Run the extraction only when executed as a script, so that importing this
# module for its helper functions does not start reading and writing files.
if __name__ == "__main__":
    main()