Source code for biokit.rtools.py2r

# -*- python -*-
#
#  This file is part of biokit software
#
#  Copyright (c) 2014-
#
#  File author(s): Thomas Cokelaer <cokelaer@ebi.ac.uk>
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      http://www.gnu.org/licenses/gpl-3.0.html
#
#  website: https://github.com/biokit
#
##############################################################################
import sys
from types import *

from easydev import check_param_in_list

import pandas
import numpy


__all__ = ["BoolStr", "ReprStr", "FloatStr", "LongStr", "ComplexStr", "UniStr",
"ByteStr", "SeqStr", "getVec", "NumpyNdarrayStr", "PandasDataFrameStr",
"PandasSerieStr", "OtherStr", "Str4R" ]


if sys.version < '3.0':
    _mystr = _mybytes = lambda s: s
    _in_py3 = False
else:
    from functools import reduce
    long, basestring, unicode = int, str, str
    _mybytes = lambda s: bytes(s, 'utf8')  # 'ascii')
    _mystr = lambda s: str(s, 'utf8')
    _in_py3 = True

[docs]def BoolStr(obj):
    return(obj and 'TRUE' or 'FALSE')


[docs]def ReprStr(obj):
    return(repr(obj))


[docs]def FloatStr(f):
    if f is numpy.NaN or f is numpy.nan:
        return('NaN') # or 'NA'
    if pandas.isnull(f):
        return('NaN')
    if numpy.isposinf(f):
        return('Inf')
    if numpy.isneginf(f):
        return('-Inf')
    return(repr(f))


[docs]def LongStr(obj):
    rv = repr(obj)
    if rv[-1] == 'L':
        rv = rv[:-1]
    return(rv)


[docs]def ComplexStr(obj):
    return(repr(obj).replace('j', 'i'))


[docs]def UniStr(obj):
    return(repr(obj.encode('utf8')))


[docs]def ByteStr(obj):
    return(repr(obj)[1:])
    #return obj.decode()


[docs]def SeqStr(obj, head='c(', tail=')', enclose=True):
    if not enclose: # don't add head and tail
        return(','.join(map(Str4R, obj)))
    if not obj:
        return(head + tail)
    # detect types
    if isinstance(obj, set):
        obj = list(obj)
    obj0 = obj[0]
    tp0 = type(obj0)
    simple_types = [str, bool, int, long, float, complex]
    num_types = [int, long, float, complex]
    is_int = tp0 in (int, long) # token for explicit converstion to integer in R since R treat an integer from stdin as double
    if tp0 not in simple_types:
        head = 'list('
    else:
        tps = isinstance(obj0, basestring) and [StringType] or isinstance(obj0, bool) and [BooleanType] or num_types
        for i in obj[1:]:
            tp = type(i)
            if tp not in tps:
                head = 'list('
                is_int = False
                break
            elif is_int and tp not in (int, long):
                is_int = False
    # convert
    return((is_int and 'as.integer(' or '') + head + ','.join(map(Str4R, obj)) + tail + (is_int and ')' or ''))


def DictStr(obj):
    return('list(' + ','.join(['%s=%s' % (Str4R(a[0]), Str4R(a[1])) for a in obj.items()]) + ')')

# 'b':boo
# lean, 'i':integer, 'u':unsigned int, 'f':float, c complex-float
# 'S'/'a':string, 'U':unicode, 'V':raw data. 'O':string?
_tpdic = {'i':'as.integer(c(%s))', 'u':'as.integer(c(%s))', 'f':'as.double(c(%s))', 'c':'as.complex(c(%s))',
        'b':'c(%s)', 'S':'c(%s)', 'a':'c(%s)', 'U':'c(%s)', 'V':'list(%s)', 'O':'as.character(c(%s))'}
[docs]def getVec(ary):
    # used for objects from numpy and pandas
    tp = ary.dtype.kind
    if len(ary.shape) > 1:
        ary = ary.reshape(reduce(lambda a,b=1: a*b, ary.shape))
    ary = ary.tolist()
    if tp != 'V':
        return(_tpdic.get(tp, 'c(%s)') % SeqStr(ary, enclose=False))
    # record array
    ary = list(map(SeqStr, ary)) # each record will be mapped to vector or list
    # use str here instead of repr since it has already been converted to str by SeqStr
    return(_tpdic.get(tp, 'list(%s)') % (', '.join(ary))) 


[docs]def NumpyNdarrayStr(obj):
    shp = obj.shape
    if len(shp) == 1: # to vector
        tp = obj.dtype
        if tp.kind != 'V':
            return(getVec(obj))
        # One-dimension record array will be converted to data.frame
        def mapField(f):
            ary = obj[f]
            tp = ary.dtype.kind
            return('"%s"=%s' % (f, _tpdic.get(tp, 'list(%s)') % SeqStr(ary.tolist(), enclose=False)))
        return('data.frame(%s)' % (', '.join(map(mapField, tp.names))))
    elif len(shp) == 2: # two-dimenstion array will be converted to matrix
        return('matrix(%s, nrow=%d, byrow=TRUE)' % (getVec(obj), shp[0]))
    else: # to array
        dim = list(shp[-2:]) # row, col
        dim.extend(shp[-3::-1])
        newaxis = list(range(len(shp)))
        newaxis[-2:] = [len(shp)-1, len(shp)-2]
        return('array(%s, dim=c(%s))' % (getVec(obj.transpose(newaxis)), repr(dim)[1:-1]))


[docs]def PandasSerieStr(obj):
    return('data.frame(%s=%s, row.names=%s)' % (obj.name, getVec(obj.values), getVec(obj.index)))


[docs]def PandasDataFrameStr(obj):
    # DataFrame will be converted to data.frame, have to explicitly name columns
    #return 'data.frame(%s, row.names=%s)' % (', '.join(map(lambda a,b=obj:a+'='+getVec(obj[a]), obj)), getVec(obj.index))
    s = ', '.join(map(lambda a,b=obj: '"%s"=%s' % (str(a), getVec(obj[a])), obj))
    return('data.frame(%srow.names=%s)' % (s and s+', ', getVec(obj.index)))
    s = ''
    for col in obj:
        s = s + col + '=' + getVec(obj[col]) + ', '
    # print 'data.frame(%s row.names=%s)' % (s, getVec(obj.index))
    return('data.frame(%s row.names=%s)' % (s, getVec(obj.index)))


[docs]def OtherStr(obj):
    if hasattr(obj, '__iter__'):  # for iterators
        if hasattr(obj, '__len__') and len(obj) <= 10000:
            return(SeqStr(list(obj)))
        else:  # waiting for better solution for huge-size containers
            return(SeqStr(list(obj)))
    return(repr(obj))


str_func = {
        type(None): 'NULL',
        bool: BoolStr,
        long: LongStr,
        int: repr,
        float: FloatStr,
        complex: ComplexStr,
        unicode: UniStr,
        str: repr,
        list: SeqStr,
        tuple: SeqStr,
        set: SeqStr,
        frozenset: SeqStr,
        dict: DictStr} # str will override uncode in Python 3

base_tps = [type(None), bool, int, long, float, complex, str, unicode, list,
        tuple, set, frozenset, dict] # use type(None) instead of NoneType since
        #the latter cannot be found in the types module in Python 3

str_func[numpy.ndarray] = NumpyNdarrayStr
base_tps.append(numpy.ndarray)

str_func.update({pandas.Series: PandasSerieStr, pandas.DataFrame: PandasDataFrameStr})
base_tps.extend([pandas.Series, pandas.DataFrame])
base_tps.reverse()

if _in_py3:
    base_tps.append(bytes)
    str_func[bytes] = ByteStr


[docs]def Str4R(obj):
    """
    convert a Python basic object into an R object in the form of string.
    """
    # for objects known by PypeR
    if type(obj) in str_func:
        return(str_func[type(obj)](obj))
    # for objects derived from basic data types
    for tp in base_tps:
        if isinstance(obj, tp):
            return(str_func[tp](obj))
    # for any other objects
    return(OtherStr(obj))