# -*- python -*-
#
# This file is part of biokit software
#
# Copyright (c) 2014-
#
# File author(s): Thomas Cokelaer <cokelaer@ebi.ac.uk>
#
# Distributed under the GPLv3 License.
# See accompanying file LICENSE.txt or copy at
# http://www.gnu.org/licenses/gpl-3.0.html
#
# website: https://github.com/biokit
#
##############################################################################
import sys
from types import *
from easydev import check_param_in_list
import pandas
import numpy
# Names exported by ``from <module> import *``.
__all__ = [
    "BoolStr",
    "ReprStr",
    "FloatStr",
    "LongStr",
    "ComplexStr",
    "UniStr",
    "ByteStr",
    "SeqStr",
    "getVec",
    "NumpyNdarrayStr",
    "PandasDataFrameStr",
    "PandasSerieStr",
    "OtherStr",
    "Str4R",
]
# Python 2/3 compatibility shims.
# The major version is compared numerically: comparing the ``sys.version``
# string lexicographically would classify a "10.x" interpreter as older
# than "3.0".
if sys.version_info[0] < 3:
    # Text needs no conversion in this branch, so both helpers are identity.
    _mystr = _mybytes = lambda s: s
    _in_py3 = False
else:
    from functools import reduce
    # Alias the Python 2-only builtin names to their Python 3 counterparts
    # so the rest of the module can use them unconditionally.
    long, basestring, unicode = int, str, str
    _mybytes = lambda s: bytes(s, 'utf8')  # text -> UTF-8 encoded bytes
    _mystr = lambda s: str(s, 'utf8')  # UTF-8 encoded bytes -> text
    _in_py3 = True
def BoolStr(obj):
    """Return the R logical literal matching the truthiness of *obj*.

    :param obj: any object; only its truth value is used.
    :return: ``'TRUE'`` when *obj* is truthy, ``'FALSE'`` otherwise.
    """
    return 'TRUE' if obj else 'FALSE'
def ReprStr(obj):
    """Return the Python ``repr`` of *obj* unchanged.

    :param obj: any object.
    :return: ``repr(obj)`` as a string.
    """
    text = repr(obj)
    return text
def FloatStr(f):
    """Return the R literal for the float *f*.

    :param f: a float (or float subclass such as ``numpy.float64``).
    :return: ``'NaN'``, ``'Inf'`` or ``'-Inf'`` for non-finite values,
        otherwise the plain decimal ``repr`` of the value.
    """
    # pandas.isnull() recognises every NaN object. An identity test against
    # ``numpy.NaN`` is not used because that alias no longer exists in
    # NumPy >= 2.0 and accessing it raises AttributeError.
    if pandas.isnull(f):
        return 'NaN'  # or 'NA'
    if numpy.isposinf(f):
        return 'Inf'
    if numpy.isneginf(f):
        return '-Inf'
    # Convert to a builtin float first: NumPy 2 scalars repr as
    # "np.float64(1.5)", which R cannot parse.
    return repr(float(f))
def LongStr(obj):
    """Return ``repr(obj)`` without a trailing ``'L'``.

    :param obj: an integer value.
    :return: the ``repr`` of *obj*, with its last character removed when
        that character is ``'L'``.
    """
    text = repr(obj)
    return text[:-1] if text[-1] == 'L' else text
def ComplexStr(obj):
    """Return the ``repr`` of *obj* with every ``'j'`` replaced by ``'i'``.

    :param obj: a complex number.
    :return: e.g. ``'(1+2i)'`` for ``1+2j``.
    """
    python_literal = repr(obj)
    return python_literal.replace('j', 'i')
def UniStr(obj):
    """Return a quoted string literal holding the UTF-8 encoding of *obj*.

    :param obj: a text (unicode) string.
    :return: the ``repr`` of the UTF-8 encoded string, without any ``b``
        bytes-literal prefix.
    """
    literal = repr(obj.encode('utf8'))
    # On Python 3 the encoded value is ``bytes`` and its repr starts with a
    # ``b`` prefix, which is not part of a valid quoted string; drop it.
    # A repr that starts directly with a quote is returned untouched.
    if literal.startswith('b'):
        return literal[1:]
    return literal
def ByteStr(obj):
    """Return ``repr(obj)`` with its first character removed.

    For a ``bytes`` value this strips the leading ``b`` of the bytes
    literal, leaving only the quoted part.

    :param obj: the value to convert (registered for ``bytes``).
    :return: ``repr(obj)[1:]``.
    """
    literal = repr(obj)
    return literal[1:]
def SeqStr(obj, head='c(', tail=')', enclose=True):
    """Convert a Python sequence into the text of an R vector or list.

    A sequence whose items all have the same simple type (all ``str``, all
    ``bool``, or all numeric) becomes ``c(...)``; anything else becomes
    ``list(...)``. A sequence made only of integers is additionally wrapped
    in ``as.integer(...)``.

    :param obj: list, tuple, set or frozenset to convert.
    :param head: opening text used for a homogeneous vector.
    :param tail: closing text.
    :param enclose: when False, return only the comma-separated items
        without *head*, *tail* or any type detection.
    :return: R source text as a string.
    """
    if not enclose:  # don't add head and tail
        return ','.join(map(Str4R, obj))
    if not obj:
        return head + tail

    # Sets, mutable or frozen, cannot be indexed or sliced; work on a list.
    if isinstance(obj, (set, frozenset)):
        obj = list(obj)

    int_types = (int, long)
    num_types = (int, long, float, complex)
    simple_types = (str, bool) + num_types

    first_type = type(obj[0])
    # Flag for an explicit conversion to integer in R, since R treats an
    # integer read from stdin as a double.
    is_int = first_type in int_types

    if first_type not in simple_types:
        head = 'list('
    else:
        # Types the remaining items may have for the result to stay a
        # vector. Builtin ``str``/``bool`` are used directly: the
        # ``StringType``/``BooleanType`` aliases of the ``types`` module
        # are not available on Python 3.
        if first_type is str:
            allowed = (str,)
        elif first_type is bool:
            allowed = (bool,)
        else:
            allowed = num_types
        for item in obj[1:]:
            item_type = type(item)
            if item_type not in allowed:
                head = 'list('
                is_int = False
                break
            if item_type not in int_types:
                is_int = False

    body = head + ','.join(map(Str4R, obj)) + tail
    return 'as.integer(' + body + ')' if is_int else body
def DictStr(obj):
    """Convert a dict into the text of an R named list.

    :param obj: mapping whose keys and values are each converted with
        ``Str4R``.
    :return: ``'list(key=value,...)'``.
    """
    pairs = []
    for key, value in obj.items():
        pairs.append('%s=%s' % (Str4R(key), Str4R(value)))
    return 'list(' + ','.join(pairs) + ')'
# R wrapper template for each numpy dtype kind code:
# 'b': boolean, 'i': integer, 'u': unsigned int, 'f': float,
# 'c': complex-float, 'S'/'a': string, 'U': unicode, 'V': raw data,
# 'O': string?
_tpdic = {
    'i': 'as.integer(c(%s))',
    'u': 'as.integer(c(%s))',
    'f': 'as.double(c(%s))',
    'c': 'as.complex(c(%s))',
    'b': 'c(%s)',
    'S': 'c(%s)',
    'a': 'c(%s)',
    'U': 'c(%s)',
    'V': 'list(%s)',
    'O': 'as.character(c(%s))',
}
def getVec(ary):
    """Convert an array-like object into the text of a flat R vector.

    Used for objects coming from numpy and pandas: *ary* must provide
    ``dtype``, ``shape``, ``reshape`` and ``tolist``.

    :param ary: the array-like object to convert.
    :return: the items of *ary*, flattened, wrapped in the R template that
        ``_tpdic`` associates with the dtype kind.
    """
    kind = ary.dtype.kind
    if len(ary.shape) > 1:
        # Flatten to one dimension whose length is the product of the shape.
        n_items = reduce(lambda a, b=1: a * b, ary.shape)
        ary = ary.reshape(n_items)
    values = ary.tolist()
    if kind == 'V':
        # Record array: each record is mapped to its own vector or list.
        # SeqStr already returns text, so the pieces are joined as they are.
        records = [SeqStr(record) for record in values]
        return _tpdic.get(kind, 'list(%s)') % (', '.join(records))
    return _tpdic.get(kind, 'c(%s)') % SeqStr(values, enclose=False)
def NumpyNdarrayStr(obj):
    """Convert a numpy array into the text of an R object.

    * 0-d and 1-d arrays become a vector (a 0-d array is treated as a
      vector of length one);
    * a 1-d record array becomes a ``data.frame`` with one column per field;
    * a 2-d array becomes a ``matrix`` filled by row;
    * arrays with more dimensions become an R ``array``.

    :param obj: the ``numpy.ndarray`` to convert.
    :return: R source text as a string.
    """
    # A 0-d array has no axes to swap in the N-d branch below, where
    # transpose() would fail; handle it as a single-item vector instead.
    if len(obj.shape) == 0:
        obj = obj.reshape(1)
    shape = obj.shape
    ndim = len(shape)

    if ndim == 1:  # to vector
        dtype = obj.dtype
        if dtype.kind != 'V':
            return getVec(obj)

        # One-dimensional record array: converted to a data.frame.
        def field_to_column(name):
            column = obj[name]
            template = _tpdic.get(column.dtype.kind, 'list(%s)')
            return '"%s"=%s' % (name, template % SeqStr(column.tolist(), enclose=False))

        return 'data.frame(%s)' % (', '.join(map(field_to_column, dtype.names)))

    if ndim == 2:  # two-dimensional array: converted to a matrix
        return 'matrix(%s, nrow=%d, byrow=TRUE)' % (getVec(obj), shape[0])

    # To array: the R dims are (row, col) followed by the leading axes in
    # reverse order, and the last two axes are swapped before flattening.
    dim = list(shape[-2:])  # row, col
    dim.extend(shape[-3::-1])
    axes = list(range(ndim))
    axes[-2:] = [ndim - 1, ndim - 2]
    return 'array(%s, dim=c(%s))' % (getVec(obj.transpose(axes)), repr(dim)[1:-1])
def PandasSerieStr(obj):
    """Convert a pandas Series into the text of a one-column R data.frame.

    :param obj: the ``pandas.Series`` to convert; its ``name`` becomes the
        column name and its index becomes ``row.names``.
    :return: R source text as a string.
    """
    # The column name is quoted, as PandasDataFrameStr does, so that names
    # which are not bare R identifiers (e.g. ``0`` or ``my col``) still
    # produce parseable R code.
    return 'data.frame("%s"=%s, row.names=%s)' % (
        obj.name, getVec(obj.values), getVec(obj.index))
def PandasDataFrameStr(obj):
    """Convert a pandas DataFrame into the text of an R data.frame.

    Columns are named explicitly with quoted names, and the index is passed
    as ``row.names``.

    :param obj: the ``pandas.DataFrame`` to convert.
    :return: R source text as a string.
    """
    columns = ', '.join(
        '"%s"=%s' % (str(col), getVec(obj[col])) for col in obj)
    # Separate the columns from row.names only when there is a column.
    if columns:
        columns += ', '
    return 'data.frame(%srow.names=%s)' % (columns, getVec(obj.index))
def OtherStr(obj):
    """Convert an object that has no dedicated converter.

    :param obj: any object.
    :return: the ``SeqStr`` conversion of ``list(obj)`` when *obj* is
        iterable, otherwise ``repr(obj)``.
    """
    if hasattr(obj, '__iter__'):  # for iterators
        # Containers of every size are handled the same way for now;
        # waiting for a better solution for huge-size containers.
        return SeqStr(list(obj))
    return repr(obj)
# Dispatch table: exact Python type -> function returning the R text.
# Every value must be callable because Str4R calls it with the object;
# the None entry is therefore a function returning 'NULL', not the bare
# string (calling a string would raise TypeError).
# When two keys are the same type, the later entry wins: on Python 3
# ``long`` is ``int`` and ``unicode`` is ``str``, so ``repr`` is used for
# both.
str_func = {
    type(None): lambda obj: 'NULL',
    bool: BoolStr,
    long: LongStr,
    int: repr,
    float: FloatStr,
    complex: ComplexStr,
    unicode: UniStr,
    str: repr,
    list: SeqStr,
    tuple: SeqStr,
    set: SeqStr,
    frozenset: SeqStr,
    dict: DictStr,
    numpy.ndarray: NumpyNdarrayStr,
    pandas.Series: PandasSerieStr,
    pandas.DataFrame: PandasDataFrameStr,
}

# Base types tried with isinstance() for objects derived from the types
# above. type(None) is used instead of NoneType since the latter cannot be
# found in the types module in Python 3.
base_tps = [
    type(None), bool, int, long, float, complex, str, unicode,
    list, tuple, set, frozenset, dict,
    numpy.ndarray, pandas.Series, pandas.DataFrame,
]
# Str4R scans this list front to back, so the entries listed last above
# are tested first.
base_tps.reverse()

if _in_py3:
    base_tps.append(bytes)
    str_func[bytes] = ByteStr
def Str4R(obj):
    """Convert a basic Python object into the text of an R object.

    :param obj: the object to convert.
    :return: the result of the converter selected for *obj*.
    """
    # Exact type match in the dispatch table.
    converter = str_func.get(type(obj))
    if converter is not None:
        return converter(obj)
    # Objects derived from one of the basic data types.
    for base in base_tps:
        if not isinstance(obj, base):
            continue
        return str_func[base](obj)
    # Any other object.
    return OtherStr(obj)