Server IP : 127.0.0.2 / Your IP : 13.58.199.13 Web Server : Apache/2.4.18 (Ubuntu) System : User : www-data ( ) PHP Version : 7.0.33-0ubuntu0.16.04.16 Disable Function : disk_free_space,disk_total_space,diskfreespace,dl,exec,fpaththru,getmyuid,getmypid,highlight_file,ignore_user_abord,leak,listen,link,opcache_get_configuration,opcache_get_status,passthru,pcntl_alarm,pcntl_fork,pcntl_waitpid,pcntl_wait,pcntl_wifexited,pcntl_wifstopped,pcntl_wifsignaled,pcntl_wexitstatus,pcntl_wtermsig,pcntl_wstopsig,pcntl_signal,pcntl_signal_dispatch,pcntl_get_last_error,pcntl_strerror,pcntl_sigprocmask,pcntl_sigwaitinfo,pcntl_sigtimedwait,pcntl_exec,pcntl_getpriority,pcntl_setpriority,php_uname,phpinfo,posix_ctermid,posix_getcwd,posix_getegid,posix_geteuid,posix_getgid,posix_getgrgid,posix_getgrnam,posix_getgroups,posix_getlogin,posix_getpgid,posix_getpgrp,posix_getpid,posix,_getppid,posix_getpwnam,posix_getpwuid,posix_getrlimit,posix_getsid,posix_getuid,posix_isatty,posix_kill,posix_mkfifo,posix_setegid,posix_seteuid,posix_setgid,posix_setpgid,posix_setsid,posix_setuid,posix_times,posix_ttyname,posix_uname,pclose,popen,proc_open,proc_close,proc_get_status,proc_nice,proc_terminate,shell_exec,source,show_source,system,virtual MySQL : OFF | cURL : ON | WGET : ON | Perl : ON | Python : ON | Sudo : ON | Pkexec : ON Directory : /usr/lib/python2.7/dist-packages/mx/Misc/ |
Upload File : |
""" CSV (Comma Separated Values) table format reader/writer.

    The two classes read and write CSV data. It's a fairly simple
    format for data exchange with nearly all spreadsheets, databases,
    organizers, etc.

    The reader class is built to be easy on the data passed in: small
    errors like missing commas don't cause an exception (but do set a
    variable to indicate this). Data is converted from CSV text into
    an internal format which can then be extracted into different
    forms of table representation.

    The writer writes standard CSV files and knows about quoting
    rules, separator handling etc. so that interfacing to spreadsheets
    and databases should pose no problem.

    Both classes can be subclassed to enhance/modify their behaviour.

    Copyright (c) 2000-2005, Marc-Andre Lemburg; mailto:mal@lemburg.com
    Copyright (c) 2000-2015, eGenix.com Software GmbH; mailto:info@egenix.com
    See the documentation for further information on copyrights,
    or contact the author. All Rights Reserved.

"""
__version__ = '1.0'

_debug = 0

import re
import sys
import types
import exceptions

from mx import TextTools, Tools

### Globals

# Singleton for parameters which were not provided to a call
NotGiven = 'NotGiven'

### Errors

class Error(exceptions.StandardError):

    """ Base class for all errors raised by this module.
    """
    pass

### Reader base class

class Reader:

    """ Lenient CSV reader.

        Parses CSV text into a list of rows (.lines), optionally
        interpreting the first row as column names (.columns).
        Malformed lines do not raise an exception; they are recorded
        in .errors and .errorlines instead.

    """
    # Did decoding have errors ?
    errors = 0

    # If there were errors, then this list contains the line number of
    # the lines with errors
    errorlines = None

    # List of lines, each being a list of strings
    lines = None

    # Width of the received data (max. number of entries per line)
    width = 0

    # List of column names found in the data's first row; possibly
    # filtered through .filter_header()
    columns = None

    # String of separator characters which are used to separate items
    # on an input line
    separators = ',;\t'

    # My version, which runs in exponential time for '"abcdefgh...'
    # parseitem = re.compile(r'"([^"]+|"")*"|[^",;\015\012]*')
    # Thanks to Andrew Kuchling for helping me with this "simple" RE
    parseitem = re.compile(r'"([^"]|"")*"|[^"' + separators + r'\015\012]*')

    # Encoding; if set to None, no decoding takes place, otherwise all
    # data is converted to Unicode using this encoding
    encoding = None

    def __init__(self, separators=None, encoding=NotGiven):

        """ Create a reader.

            separators, if given, is a string of characters which
            each act as item separator; it overrides the class
            default. encoding, if given, overrides the class default
            encoding (None means: do not decode).

        """
        if separators is not None:
            # ._decode() validates the character following each item
            # against .separators, so it has to be kept in sync with
            # the RE. The characters are escaped because they are
            # placed inside a character class.
            self.separators = separators
            self.parseitem = re.compile(
                r'"([^"]|"")*"|[^"' + re.escape(separators) + r'\015\012]*')
        if encoding is not NotGiven:
            self.encoding = encoding

    def flush(self):

        """ Empty the object and reset errors
        """
        self.lines = None
        self.width = 0
        self.errors = 0
        self.errorlines = None
        self.columns = None

    def load(self, file, header=1, columns=None,
             StringType=types.StringType):

        """ Read a file.

            file may be a file name or an object providing .read()
            and .close(). The file is always closed after reading,
            even if reading fails.

            If header is true (default), the first line of input is
            interpreted as list of column names.

            If columns is given as list of strings, these columns
            names are used.

            If both header and columns are used, columns overrides the
            columns set by reading the header line. This is useful to
            override header information from the input data.

        """
        if isinstance(file, (StringType, types.UnicodeType)):
            file = open(file, 'rb')
        try:
            text = file.read()
        finally:
            file.close()
        self.lines = self._decode(text)
        if header:
            self.process_header()
        if columns is not None:
            self.set_columns(columns)

    def loads(self, text, header=1, columns=None):

        """ Read the given text

            If header is true (default), the first line of input is
            interpreted as list of column names.

            If columns is given as list of strings, these columns
            names are used.

            If both header and columns are used, columns overrides the
            columns set by reading the header line. This is useful to
            override header information from the input data.

        """
        self.lines = self._decode(text)
        if header:
            self.process_header()
        if columns is not None:
            self.set_columns(columns)

    def list(self):

        """ Return the current data as list of lists, each having
            self.width string entries.

            Missing entries are set to None. The rows are copies, so
            modifying the result does not change the reader's data.

        """
        width = self.width
        table = []
        for row in self.lines:
            row = row[:]
            missing = width - len(row)
            if missing > 0:
                row.extend([None] * missing)
            table.append(row)
        return table

    def dictionary(self):

        """ Return the current data as dictionary of lists of strings,
            with one entry for each column.

            Entries missing from a row are set to None.

            .columns must have been set using .set_columns() or by
            processing a given CSV header; otherwise an Error is
            raised.

        """
        lines = self.lines
        keys = self.columns
        if keys is None:
            raise Error('no columns set')
        rows = len(lines)
        table = {}
        for key in keys:
            table[key] = [None] * rows
        for i, key in enumerate(keys):
            column = table[key]
            for j, row in enumerate(lines):
                if len(row) > i:
                    column[j] = row[i]
        return table

    def objects(self, constructor):

        """ Builds a list of objects by calling the given constructor
            with keywords defined by mapping column names to values
            for each input line.

            .columns must have been set using .set_columns() or by
            processing a given CSV header; otherwise an Error is
            raised.

        """
        lines = self.lines
        keys = self.columns
        if keys is None:
            raise Error('no columns set')
        return [constructor(**dict(Tools.tuples(keys, line)))
                for line in lines]

    def process_header(self):

        """ Process the header data.

            This also sets the .columns attribute. The header is
            removed from the data in .lines after having been
            processed.

            The header is passed through .filter_header() before
            interpreting it as list of column names.

            Raises an Error in case there is no data to take the
            header from.

        """
        lines = self.lines
        if not lines:
            raise Error('missing header data')
        self.columns = self.filter_header(lines[0])
        del lines[0]

    def set_columns(self, columns):

        """ Sets the column names to use.

            This overrides any column names possibly given in the read
            data.

        """
        self.columns = columns

    def filter_header(self, header, lower=TextTools.lower):

        """ Filter the given header line.

            The base class converts the column names to all lowercase
            and removes any whitespace included in the header.

            This method is only called in case the header was read
            from the data provided to the object.

        """
        return [''.join(lower(column).split()) for column in header]

    def description(self, header=1):

        """ Return a list of tuples (column name, max length) found in
            the data.

            If header is true (default), the column names themselves
            are included in the calculation.

            Raises an Error in case no columns have been set.

        """
        lines = self.lines
        columns = self.columns
        if columns is None:
            raise Error('no columns set')
        width = len(columns)
        if header:
            lengths = [len(column) for column in columns]
        else:
            lengths = [0] * width
        for row in lines:
            # Items beyond the known columns are ignored
            for i, item in enumerate(row[:width]):
                if len(item) > lengths[i]:
                    lengths[i] = len(item)
        return list(zip(columns, lengths))

    def _decode(self, text):

        """ Decode the CSV data in text.

            Returns the list of unquoted lines and sets .width as well
            as .errors and .errorlines (the latter two only in case
            errors were found).

            Internal method. Do not use directly.

        """
        lines = []
        x = 0
        length = len(text)
        width = 0
        errorlines = []
        match = self.parseitem.match
        separators = self.separators
        while x < length:
            l = []
            while x <= length:
                # Find next token
                m = match(text, x)
                if not m:
                    # End of line
                    break
                y = m.regs[0][1]
                value = text[x:y]
                if self.encoding is not None:
                    # Convert to Unicode, if requested
                    value = unicode(value, self.encoding)
                l.append(value)
                if _debug:
                    print('%s %s' % (x, repr(l[-1])))
                x = y + 1
                if x > length:
                    break
                # Check validity
                if text[y:x] not in separators:
                    if text[y:y+2] == '\015\012':
                        # Handle CRLF
                        x = y + 2
                    elif text[y:x] not in '\015\012':
                        # Syntax error: missing ',' or ';'
                        # Action: skip to end of line
                        y = text.find('\012', x)
                        if y < 0:
                            y = text.find('\015', x)
                        if y < 0:
                            # Skip to end of text
                            x = length
                        else:
                            x = y + 1
                        if _debug:
                            print('errors in %s %s' % (l, x))
                        errorlines.append(len(lines))
                    # else: found single CR or LF
                    break
            if len(l) > width:
                width = len(l)
            if _debug:
                print('adding %s %s %s' % (l, x, repr(text[x:x+5])))
            lines.append(l)
        self.width = width
        if errorlines:
            self.errors = 1
            self.errorlines = errorlines
        return [self._unquote(line) for line in lines]

    def _unquote(self, line):

        """ Unquote a CSV style quoted line of text.

            The line (a list of strings) is modified in place and
            returned.

            Internal method. Do not use directly.

        """
        for i, text in enumerate(line):
            if text[:1] == '"' and text[-1:] == '"':
                text = text[1:-1]
            line[i] = text.replace('""', '"')
        return line

    def __len__(self):

        return len(self.lines)

    def __str__(self):

        lines = self.list()
        desc = self.description()
        ncolumns = len(desc)
        output = []
        write = output.append
        for name, length in desc:
            write('%-*s|' % (length, name))
        write('\n')
        for name, length in desc:
            write('=' * length + '+')
        write('\n')
        for line in lines:
            # Rows may be wider than the list of known columns; only
            # items which have a column description are shown
            for i, item in enumerate(line[:ncolumns]):
                write('%-*s|' % (desc[i][1], item))
            write('\n')
        return ''.join(output)

### Writer base class

class Writer:

    """ CSV writer.

        Collects CSV text in .text from lists, dictionaries or
        objects. All items are written quoted.

    """
    # Column names
    columns = None

    # CSV text
    text = ''

    # Separator to use for separating fields of a row (default: comma)
    separator = ','

    # End-of-line marker to use (default: CRLF)
    lineend = '\015\012'

    # Encoding to be used for Unicode strings
    encoding = 'utf-8'

    def __init__(self, separator=None, lineend=None, encoding=None):

        """ Create a writer.

            separator, lineend and encoding override the
            corresponding class defaults, if given.

        """
        if separator is not None:
            self.separator = separator
        if lineend is not None:
            self.lineend = lineend
        if encoding is not None:
            self.encoding = encoding

    def flush(self):

        """ Flush the data currently stored in the writer.
        """
        self.text = ''
        self.columns = None

    def set_columns(self, columns, header=1):

        """ Sets the output columns.

            If header is true, a column name line is added to the
            output.

            Columns can only be set once per session and must be set
            prior to adding any data. columns has to be a list of
            column names. An Error is raised otherwise.

            It is assured that no more than len(columns) items are
            written for each row. All rows are filled up with ""
            entries to have an equal number of items.

        """
        if columns == self.columns and not header:
            # Nothing to do
            return
        elif self.columns:
            raise Error('cannot write columns more than once per session')
        self.columns = columns
        if header:
            if self.text:
                raise Error('cannot add header to already written data')
            headerline = self._quote(columns)
            self.text = self.separator.join(headerline) + self.lineend

    def feed_list(self, table):

        """ Feeds a table (list of rows) which is converted
            to CSV.

            No more than len(columns) items are written for each
            row. All rows are filled up with "" entries to have an
            equal number of items. None entries are converted to empty
            strings, all other objects are stringified.

            An empty table adds nothing to the output.

        """
        if len(table) == 0:
            # Nothing to add; this also avoids calling max() on an
            # empty sequence below
            return
        columns = self.columns
        if columns:
            rowlen = len(columns)
        else:
            # Calculate the max. number of columns in the table
            rowlen = max(map(len, table))

        # Quote, pad and join the rows
        _quote = self._quote
        separator = self.separator
        t = []
        for row in table:
            row = _quote(row[:rowlen])
            if len(row) < rowlen:
                row.extend(['""'] * (rowlen - len(row)))
            t.append(separator.join(row))

        # Add final CRLF and add as CSV text
        t.append('')
        self.text = self.text + self.lineend.join(t)

    def feed_dict(self, table, rows=None):

        """ Feeds a table (dict of lists) which is converted
            to CSV.

            Only the keys set as column names are used to form the CSV
            data.

            All lists in the dictionary must have equal length or at
            least rows number of entries, if rows is given. None
            entries are converted to empty strings, all other objects
            are stringified.

            Raises an Error in case no output columns have been set.

        """
        columns = self.columns
        if not columns:
            raise Error('no output columns set')

        # Determine the number of rows, if not given
        if not rows:
            rows = 0
            for column in columns:
                nrows = len(table[column])
                if nrows > rows:
                    rows = nrows

        # Build the table row by row
        t = [[table[key][i] for key in columns]
             for i in range(rows)]

        # Quote and join lines
        t = [self.separator.join(self._quote(x)) for x in t]

        # Add final CRLF and store CSV text
        t.append('')
        self.text = self.text + self.lineend.join(t)

    def feed_objects(self, objects,
                     getattr=getattr):

        """ Feeds a sequence of objects which is converted to CSV.

            For each object the set column names are interpreted as
            object attributes and used as basis for the CSV data.

            None values are converted to empty strings, all other
            attributes are added stringified.

            Raises an Error in case no output columns have been set.

        """
        columns = self.columns
        if not columns:
            raise Error('no output columns set')

        # Build the table: one row of attribute values per object
        t = [[getattr(obj, name) for name in columns]
             for obj in objects]

        # Quote and join lines
        t = [self.separator.join(self._quote(x)) for x in t]

        # Add final CRLF and store CSV text
        t.append('')
        self.text = self.text + self.lineend.join(t)

    def dumps(self):

        """ Returns the data as CSV text
        """
        return self.text

    def dump(self, file,
             StringType=types.StringType):

        """ Write the converted CSV data to a file

            file may be a file name or an object providing .write().
            Files opened by this method are closed again, even if
            writing fails; file objects passed in are left open.

        """
        if isinstance(file, (StringType, types.UnicodeType)):
            file = open(file, 'wb')
            try:
                file.write(self.text)
            finally:
                file.close()
        else:
            file.write(self.text)

    def _quote(self, line,
               str=str):

        """ CSV style quote the given line of text.

            Returns a new list of quoted strings. None is written as
            empty string, Unicode items are encoded using .encoding
            and all other items are stringified.

        """
        nline = []
        for item in line:
            if item is None:
                text = ''
            elif isinstance(item, unicode):
                text = item.encode(self.encoding)
            else:
                text = str(item)
            nline.append('"%s"' % text.replace('"', '""'))
        return nline

def _test():

    """ Run a few round-trip checks and print the results.

        If a file name is given on the command line, that file is
        read and printed as well.

    """
    import sys
    global _debug

    # Turn on debugging output
    #_debug = 1

    s = """"key1","key2",key3,key4
"abc",0,,0,"def""ghi
Eine neue Zeile
0","hallo",
"line",with"errors,
"text with,embedded,commas",2,3,4
newline,new,luck,;123,""\015
\"""Quote""\",1,2,3,"""#"

    r = Reader()
    r.loads(s)
    lt = r.list()
    print(lt)
    print(r.dictionary())
    r.set_columns(('Name','Vorname'))
    print(r.dictionary())
    print(r.description())
    print('-' * 72)

    w = Writer()
    w.feed_list(r.list())
    w.feed_list(r.list())
    s = w.dumps()
    print(s)
    #w.dump('test.txt')
    w.flush()
    table = r.dictionary()
    w.set_columns(table.keys())
    w.feed_dict(table)
    s = w.dumps()
    print('--> ' + s)

    r.flush()
    r.loads(s)
    lt = r.list()
    print(lt)
    print(r.dictionary())
    r.set_columns(('Name','Vorname'))
    print(r.dictionary())
    print(r.description())
    print('-' * 72)

    class Obj:
        pass
    data = []
    for i in range(10):
        o = Obj()
        o.id = i
        o.str = str(i)
        o.unicode = unicode(i) + u'\xe4\xf6\xfc'
        data.append(o)
    w.flush()
    w.set_columns(('id', 'str', 'unicode'))
    w.feed_objects(data)
    w.feed_objects(data)
    s = w.dumps()
    print('--> ' + s)

    print('Strings mode:')
    r.flush()
    r.loads(s)
    lt = r.list()
    print(lt)
    print(r.dictionary())
    r.set_columns(('id','str', 'unicode'))
    print(r.dictionary())
    print(r.description())

    print('Unicode mode:')
    r.flush()
    r.encoding = 'utf-8'
    r.loads(s)
    lt = r.list()
    print(lt)
    print(r.dictionary())
    r.set_columns(('id','str', 'unicode'))
    print(r.dictionary())
    print(r.description())
    r.encoding = None
    print('-' * 72)

    # Reading a file is optional: only done if a name was given
    if len(sys.argv) > 1:
        print('')
        print('Read File: %s' % sys.argv[1])
        r.flush()
        f = open(sys.argv[1])
        r.load(f)
        print(r)

if __name__ == '__main__':
    _test()