cwm_string.py

#! /usr/bin/python 
"""

$Id$

String built-ins for cwm
This started as http://www.w3.org/2000/10/swap/string.py

See cwm.py
"""

import string
import re

from diag import verbosity, progress

import urllib # for hasContent

from term import LightBuiltIn, ReverseFunction, Function, UnknownType
from local_decimal import Decimal

LITERAL_URI_prefix = "data:text/n3;"


STRING_NS_URI = "http://www.w3.org/2000/10/swap/string#"


###############################################################################################
#
#                               S T R I N G   B U I L T - I N s
#
# This should be in a separate module, imported and called once by the user
# to register the code with the store
#
#   Light Built-in classes

class BI_GreaterThan(LightBuiltIn):
    def eval(self,  subj, obj, queue, bindings, proof, query):
        return (subj.string > obj.string)

class BI_NotGreaterThan(LightBuiltIn):
    def eval(self,  subj, obj, queue, bindings, proof, query):
        return (subj.string <= obj.string)

class BI_LessThan(LightBuiltIn):
    def eval(self,  subj, obj, queue, bindings, proof, query):
        return (subj.string < obj.string)

class BI_NotLessThan(LightBuiltIn):
    def eval(self,  subj, obj, queue, bindings, proof, query):
        return (subj.string >= obj.string)

class BI_StartsWith(LightBuiltIn):
    def eval(self,  subj, obj, queue, bindings, proof, query):
        return subj.string.startswith(obj.string)

class BI_EndsWith(LightBuiltIn):
    def eval(self,  subj, obj, queue, bindings, proof, query):
        return subj.string.endswith(obj.string)

# Added, SBP 2001-11:-

class BI_Contains(LightBuiltIn):
    def eval(self,  subj, obj, queue, bindings, proof, query):
        return subj.string.find(obj.string) >= 0

class BI_ContainsIgnoringCase(LightBuiltIn):
    def eval(self,  subj, obj, queue, bindings, proof, query):
        return subj.string.lower().find(obj.string.lower()) >= 0

class BI_ContainsRoughly(LightBuiltIn):
    def eval(self,  subj, obj, queue, bindings, proof, query):
        return normalizeWhitespace(subj.string.lower()).find(normalizeWhitespace(obj.string.lower())) >= 0

class BI_DoesNotContainRoughly(LightBuiltIn):
    def eval(self,  subj, obj, queue, bindings, proof, query):
        return normalizeWhitespace(subj.string.lower()).find(normalizeWhitespace(obj.string.lower())) < 0

class BI_DoesNotContain(LightBuiltIn): # Converse of the above
    def eval(self,  subj, obj, queue, bindings, proof, query):
        return subj.string.find(obj.string) < 0

class BI_equalIgnoringCase(LightBuiltIn):
    def eval(self,  subj, obj, queue, bindings, proof, query):
        return (subj.string.lower() == obj.string.lower())

class BI_notEqualIgnoringCase(LightBuiltIn):
    def eval(self,  subj, obj, queue, bindings, proof, query):
        return (string.lower(subj.string) != string.lower(obj.string))


def normalizeWhitespace(s):
    "Normalize whitespace sequences in a string to single spaces"
    res = ""
    for ch in s:
        if ch in " \t\r\n":
            if res[-1:]!=" ": res = res + " " 
        else:
            res = res + ch
    return res

#  String Constructors - more light built-ins
make_string = unicode

class BI_concat(LightBuiltIn, ReverseFunction):
    def evaluateSubject(self, obj_py):
        if verbosity() > 80: progress("Concat input:"+`obj_py`)
        str = ""
        for x in obj_py:
            if not isString(x): return None # Can't
            str = str + x 
        return str

class BI_concatenation(LightBuiltIn, Function):
    def evaluateObject(self, subj_py):
        if verbosity() > 80: progress("Concatenation input:"+`subj_py`)
        str = ""
        for x in subj_py:
            if not isString(x):
                if type(x) == type(long()) or isinstance(x, Decimal):
                    x = make_string(x)
                else:
                    x = `x`
                if verbosity() > 34: progress("Warning: Coercing to string for concat:"+`x`)
#               return None # Can't
            str = str + x 
        return str

    def evalObj45(self,  subj, queue, bindings, proof, query):
#        raise RuntimeError('I got here!')
        subj_py = list(subj)
        if verbosity() > 80: progress("Concatenation input:"+`subj_py`)
        retVal = []
        for x in subj_py:
            try:
                val = x.value()
                if not isString(val):
                    if type(val) == type(long()) or isinstance(val, Decimal):
                        val = make_string(val)
                    else:
                        val = `val`
                    if verbosity() > 34: progress("Warning: Coercing to string for concat:"+`val`)
                retVal.append(val)
            except UnknownType:
                progress("Warning: Coercing to string for concat:"+`x`)
                retVal.append(x.string)
        return subj.store.newLiteral(''.join(retVal))

class BI_scrape(LightBuiltIn, Function):
    """a built-in for scraping using regexps.
    takes a list of 2 strings; the first is the
    input data, and the second is a regex with one () group.
    Returns the data matched by the () group.

    see also: test/includes/scrape1.n3
    Hmm... negative tests don't seem to work.
    """
    
    def evaluateObject(self, subj_py):
        if verbosity() > 80: progress("scrape input:"+`subj_py`)

        str, pat = subj_py
        patc = re.compile(pat)
        m = patc.search(str)

        if m:
            if verbosity() > 80: progress("scrape matched:"+m.group(1))
            return m.group(1)
        if verbosity() > 80: progress("scrape didn't match")

class BI_replace(LightBuiltIn, Function):
    """A built-in for replacing characters or sub.
    takes a list of 3 strings; the first is the
    input data, the second the old and the third the new string.
    The object is calculated as the rplaced string.
    For example, ("fofof bar", "of", "baz") string:replace "fbazbaz bar".
    """
    
    def evaluateObject(self, subj_py):
        str, old, new = subj_py
        return str.replace(old, new)

class BI_search(LightBuiltIn, Function):
    """a more powerful built-in for scraping using regexps.
    takes a list of 2 strings; the first is the
    input data, and the second is a regex with one or more () group.
    Returns the list of data matched by the () groups.

    see also: test/includes/search.n3
    """
    
    def evaluateObject(self, subj_py):
#        raise Error
        store = self.store
        if verbosity() > 80: progress("search input:"+`subj_py`)

        str, pat = subj_py
        patc = re.compile(pat)
        m = patc.search(str)

        if m:
            if verbosity() > 80: progress("search matched:"+m.group(1))
            return m.groups()
        if verbosity() > 80: progress("search didn't match")

class BI_split(LightBuiltIn, Function):
    """split a string into a list of strings
    takes a list of 2 strings and an integer; the first is the
    input data, and the second is a regex
    see re.split in http://docs.python.org/lib/node46.html

    """
    
    def evaluateObject(self, subj_py):
        store = self.store
        str, pat, q = subj_py
        patc = re.compile(pat)
        return patc.split(str, q)


class BI_tokenize(LightBuiltIn, Function):
    """like split without the max arg
    """
    
    def evaluateObject(self, subj_py):
        store = self.store
        str, pat = subj_py
        patc = re.compile(pat)
        return patc.split(str)


class BI_normalize_space(LightBuiltIn, Function):
    """Returns the value of $arg with whitespace normalized by
    stripping leading and trailing whitespace and replacing sequences
    of one or more than one whitespace character with a single space,
    #x20 -- http://www.w3.org/2006/xpath-functions#normalize-space
    """
    
    def evaluateObject(self, subj_py):
        store = self.store
        return ' '.join(subj_py.split())


class BI_stringToList(LightBuiltIn, Function, ReverseFunction):
    """You need nothing else. Makes a string a list of characters, and visa versa.


    """
    def evaluateObject(self, subj_py):
        print "hello, I'm at it"
        try:
            return [a for a in subj_py]
        except TypeError:
            return None
        
    def evaluateSubject(self, obj_py):
        try:
            return "".join(obj_py)
        except TypeError:
            return None

class BI_format(LightBuiltIn, Function):
    """a built-in for string formatting,
    ala python % or C's sprintf or common-lisp's format
    takes a list; the first item is the format string, and the rest are args.
    see also: test/@@
    """
    
    def evaluateObject(self, subj_py):
        return subj_py[0] % tuple(subj_py[1:])

class BI_matches(LightBuiltIn):
    def eval(self,  subj, obj, queue, bindings, proof, query):
        return (re.compile(obj.string).search(subj.string))

class BI_notMatches(LightBuiltIn):
    def eval(self,  subj, obj, queue, bindings, proof, query):
        return (not re.compile(obj.string).search(subj.string))


dataEsc = re.compile(r"[\r<>&]")  # timbl removed \n as can be in data
attrEsc = re.compile(r"[\r<>&'\"\n]")

class BI_xmlEscapeData(LightBuiltIn, Function):
    """Take a unicode string and return it encoded so as to pass in an XML data
    You will need the BI_xmlEscapeAttribute on for attributes, escaping quotes."""
    
    def evaluateObject(self, subj_py):
        return xmlEscape(subj_py, dataEsc)
        
class BI_xmlEscapeAttribute(LightBuiltIn, Function):
    """Take a unicode string and return it encoded so as to pass in an XML data
    You may need stg different for attributes, escaping quotes."""
    
    def evaluateObject(self, subj_py):
        return xmlEscape(subj_py, attrEsc)

def xmlEscape(subj_py, markupChars):
    """Escape a string given a regex of the markup chars to be escaped
    from toXML.py """
    i = 0
    result = ""
    while i < len(subj_py):
        m = markupChars.search(subj_py, i)
        if not m:
            result = result + subj_py[i:]
            break
        j = m.start()
        result = result + subj_py[i:j]
        result = result +  ("&#%d;" % (ord(subj_py[j]),))
        i = j + 1
    return result


class BI_encodeForURI(LightBuiltIn, Function):
    """Take a unicode string and return it encoded so as to pass in an
    URI path segment. See
    http://www.w3.org/TR/2005/CR-xpath-functions-20051103/#func-encode-for-uri"""
    
    def evaluateObject(self, subj_py):
        return urllib.quote(subj_py, "#!~*'()")

class BI_encodeForFragID(LightBuiltIn, Function):
    """Take a unicode string and return it encoded so as to pass in
    a URI grament identifier."""
    
    def evaluateObject(self, subj_py):
        return urllib.quote(subj_py)

class BI_resolve_uri(LightBuiltIn, Function):
    """see http://www.w3.org/2006/xpath-functions#resolve-uri"""
    
    def evaluateObject(self, subj_py):
        import uripath
        there, base = subj_py
        return uripath.join(base, there)


#  Register the string built-ins with the store

def isString(x):
    # in 2.2, evidently we can test for isinstance(types.StringTypes)
    return type(x) is type('') or type(x) is type(u'')

def register(store):
    str = store.symbol(STRING_NS_URI[:-1])
    
    str.internFrag("greaterThan", BI_GreaterThan)
    str.internFrag("notGreaterThan", BI_NotGreaterThan)
    str.internFrag("lessThan", BI_LessThan)
    str.internFrag("notLessThan", BI_NotLessThan)
    str.internFrag("startsWith", BI_StartsWith)
    str.internFrag("endsWith", BI_EndsWith)
    str.internFrag("concat", BI_concat)
    str.internFrag("concatenation", BI_concatenation)
    str.internFrag("scrape", BI_scrape)
    str.internFrag("replace", BI_replace)
    str.internFrag("search", BI_search)
    str.internFrag("split", BI_split)
    str.internFrag("stringToList", BI_stringToList)
    str.internFrag("format", BI_format)
    str.internFrag("matches", BI_matches)
    str.internFrag("notMatches", BI_notMatches)
    str.internFrag("contains", BI_Contains)
    str.internFrag("containsIgnoringCase", BI_ContainsIgnoringCase)
    str.internFrag("containsRoughly", BI_ContainsRoughly)
    str.internFrag("notContainsRoughly", BI_DoesNotContainRoughly)
    str.internFrag("doesNotContain", BI_DoesNotContain)
    str.internFrag("equalIgnoringCase", BI_equalIgnoringCase)
    str.internFrag("notEqualIgnoringCase", BI_notEqualIgnoringCase)
    str.internFrag("xmlEscapeAttribute", BI_xmlEscapeAttribute)
    str.internFrag("xmlEscapeData", BI_xmlEscapeData)
    str.internFrag("encodeForURI", BI_encodeForURI)
    str.internFrag("encodeForFragID", BI_encodeForFragID)

    
    fn = store.symbol("http://www.w3.org/2006/xpath-functions")
    fn.internFrag("resolve-uri", BI_resolve_uri)
    fn.internFrag("tokenize", BI_tokenize)
    fn.internFrag("normalize-space", BI_normalize_space)