Port of Israel Ekpo's CSV parser library

Dependents:   parser_sample IoTGateway_Basic

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers csv_parser.h Source File

csv_parser.h

00001 /*
00002 
00003 Copyright (c) 2008 - 2009, Israel Ekpo
00004 All rights reserved.
00005 
00006 Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
00007 
00008     * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
00009     * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
00010     * Neither the name of Israel Ekpo nor the names of contributors may be used to endorse or promote products derived from this software without specific prior written permission.
00011 
00012 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00013 
00014 
00015 */
00016 /**
00017  * csv_parser Header File
00018  *
00019  * This object is used to parse text documents that are delimited by some
00020  * type of character. Some of the common ones use spaces, tabs, commas and semi-colons.
00021  *
00022  * This is a list of common characters encountered by this program
00023  *
00024  * This list was prepared from the data from http://www.asciitable.com
00025  *
00026  * @li DEC is how it would be represented in decimal form (base 10)
00027  * @li HEX is how it would be represented in hexadecimal format (base 16)
00028  *
00029  * @li    DEC    HEX        Character Name
00030  * @li    0    0x00    null
00031  * @li    9    0x09    horizontal tab
00032  * @li    10    0x0A    line feed, new line
00033  * @li    13    0x0D    carriage return
00034  * @li    27    0x1B    escape
00035  * @li    32    0x20    space
00036  * @li    34    0x22    double quote
00037  * @li    39    0x27    single quote
00038  * @li    44    0x2C    comma
00039  * @li    92    0x5C    backslash
00040  *
00041  * @author Israel Ekpo <israel.ekpo@israelekpo.com>
00042  */
00043 
00044 #ifndef CSV_PARSER_HPP_INCLUDED
00045 
00046 #define CSV_PARSER_HPP_INCLUDED
00047 
00048 #define LIBCSV_PARSER_MAJOR_VERSION 1
00049 
00050 #define LIBCSV_PARSER_MINOR_VERSION 0
00051 
00052 #define LIBCSV_PARSER_PATCH_VERSION 0
00053 
00054 #define LIBCSV_PARSER_VERSION_NUMBER 10000
00055 
00056 /* C++ header files */
00057 #include <string>
00058 #include <vector>
00059 
00060 /* C header files */
00061 #include <cstdio>
00062 #include <cstring>
00063 #include <cstdlib>
00064 
00065 using namespace std;
00066 
/**
 * @typedef csv_row
 *
 * Data structure used to represent a single record: one string per field.
 *
 * This is an alias for std::vector<std::string>. The names are fully
 * qualified so the alias does not depend on any using-directive being
 * in effect at this point of the header.
 */
typedef std::vector<std::string> csv_row;

/**
 * @typedef csv_row_ptr
 *
 * Pointer to a csv_row object.
 *
 * Expands to std::vector<std::string> *
 */
typedef csv_row * csv_row_ptr;
00084 
/**
 * @typedef enclosure_type_t
 *
 * Selects the mode in which the CSV file is parsed.
 *
 * @li ENCLOSURE_NONE         (1) the fields are never wrapped in enclosure characters
 * @li ENCLOSURE_REQUIRED     (2) every field must be wrapped in enclosure characters
 * @li ENCLOSURE_OPTIONAL     (3) a field may or may not be wrapped in enclosure characters
 *
 * ENCLOSURE_TYPE_BEGIN and ENCLOSURE_TYPE_END only bracket the valid range
 * and are never to be used as a parsing mode.
 */
typedef enum
{
    ENCLOSURE_TYPE_BEGIN = 0,   /* lower sentinel, not a valid mode */
    ENCLOSURE_NONE       = 1,
    ENCLOSURE_REQUIRED   = 2,
    ENCLOSURE_OPTIONAL   = 3,
    ENCLOSURE_TYPE_END   = 4    /* upper sentinel, not a valid mode */

} enclosure_type_t;
00105 
00106 /**
00107  * @def CSV_PARSER_FREE_BUFFER_PTR(ptr)
00108  *
00109  * Used to deallocate buffer pointers
00110  *
00111  * It deallocates the pointer only if it is not null
00112  */
00113 #define CSV_PARSER_FREE_BUFFER_PTR(ptr)    \
00114 if (ptr != NULL)                        \
00115 {                                        \
00116     free(ptr);                            \
00117                                         \
00118     ptr = NULL;                            \
00119 }
00120 
/**
 * @def CSV_PARSER_FREE_FILE_PTR(fptr)
 *
 * Used to close open file handles.
 *
 * The stream is closed only if the handle is not null (fclose(NULL) is not
 * allowed), and the handle is then reset to NULL so it cannot be closed twice.
 *
 * @li fptr must be a modifiable lvalue; it is evaluated more than once.
 * @li The expansion is a single statement (do { } while (0)), so the macro is
 *     safe inside an unbraced if/else. Terminate the invocation with a semicolon.
 */
#define CSV_PARSER_FREE_FILE_PTR(fptr)      \
    do                                      \
    {                                       \
        if ((fptr) != NULL)                 \
        {                                   \
            fclose(fptr);                   \
                                            \
            (fptr) = NULL;                  \
        }                                   \
    } while (0)
00135 
00136 /**
00137  * @class csv_parser
00138  *
00139  * The csv_parser object
00140  *
00141  * Used to parse text files to extract records and fields.
00142  *
00143  * We are making the following assumptions :
00144  *
00145  * @li The record terminator is only one character in length.
00146  * @li The field terminator is only one character in length.
00147  * @li The fields are enclosed by single characters, if any.
00148  *
00149  * @li The parser can handle documents where fields are always enclosed, not enclosed at all or optionally enclosed.
00150  * @li When fields are strictly all enclosed, there is an assumption that any enclosure characters within the field are escaped by placing a backslash in front of the enclosure character.
00151  *
00152  * The CSV files can be parsed in 3 modes.
00153  * @li (a) No enclosures
00154  * @li (b) Fields always enclosed.
00155  * @li (c) Fields optionally enclosed.
00156  *
00157  * For option (c) when the enclosure character is optional, if an enclosure character is spotted at either the beginning
00158  * or the end of the string, it is assumed that the field is enclosed.
00159  *
00160  * The csv_parser::init() method can accept a character array as the path to the CSV file.
00161  * Since it is overloaded, it can also accept a FILE pointer to a stream that is already open for reading.
00162  *
00163  * The set_enclosed_char() method accepts the field enclosure character as the first parameter and the enclosure mode as the second parameter which
00164  * controls how the text file is going to be parsed.
00165  *
00166  * @see csv_parser::set_enclosed_char()
00167  * @see enclosure_type_t
00168  *
00169  * @todo Add ability to parse files where fields/columns are terminated by strings instead of just one char.
00170  * @todo Add ability to set strings where lines start by. Currently lines do not have any starting char or string.
00171  * @todo Add ability to set strings where line end by. Currently lines can only end with a single char.
00172  * @todo Add ability to accept other escape characters besides the backslash character 0x5C.
00173  * @todo More support for improperly formatted CSV data files.
00174  *
00175  * @author Israel Ekpo <israel.ekpo@israelekpo.com>
00176  */
00177 class csv_parser
00178 {
00179 
00180 public :
00181 
00182     /**
00183      * Class constructor
00184      *
00185      * This is the default constructor.
00186      *
00187      * All the internal attributes are initialized here
00188      *
00189      * @li The enclosure character is initialized to NULL 0x00.
00190      * @li The escape character is initialized to the backslash character 0x5C.
00191      * @li The field delimiter character is initialized to a comma 0x2C.
00192      * @li The record delimiter character is initialized to a new line character 0x0A.
00193      *
00194      * @li The lengths of all the above-mentioned fields are initialized to 0,1,1 and 1 respectively.
00195      * @li The number of records to ignore is set to zero.
00196      * @li The more_rows internal attribute is set to false.
00197      * @li The pointer to the CSV input file is initialized to NULL
00198      * @li The pointer to the buffer for the file name is also initialized to NULL
00199      */
00200     csv_parser() : enclosed_char(0x00),     escaped_char(0x5C),
00201                    field_term_char(0x2C),      line_term_char(0x0A),
00202                    enclosed_length(0U),        escaped_length(1U),
00203                    field_term_length(1U),      line_term_length(1U),
00204                    ignore_num_lines(0U),       record_count(0U),
00205                    input_fp(NULL),               input_filename(NULL),
00206                    enclosure_type(ENCLOSURE_NONE),
00207                    more_rows(false)
00208                    { }
00209 
00210     /**
00211      * Class destructor
00212      *
00213      * In the class destructor the file pointer to the input CSV file is closed and
00214      * the buffer to the input file name is also deallocated.
00215      *
00216      * @see csv_parser::input_fp
00217      * @see csv_parser::input_filename
00218      */
00219     ~csv_parser()
00220     {
00221         CSV_PARSER_FREE_FILE_PTR(input_fp);
00222 
00223         CSV_PARSER_FREE_BUFFER_PTR(input_filename);
00224     }
00225 
00226     /**
00227      * Initializes the current object
00228      *
00229      * This init method accepts a pointer to the CSV file that has been opened for reading
00230      *
00231      * It also resets the file pointer to the beginning of the stream
00232      *
00233      * @overload bool init(FILE * input_file_pointer)
00234      * @param[in] input_file_pointer
00235      * @return bool Returns true on success and false on error.
00236      */
00237     bool init(FILE * input_file_pointer);
00238 
00239     /**
00240      * Initializes the current object
00241      *
00242      * @li This init method accepts a character array as the path to the csv file.
00243      * @li It sets the value of the csv_parser::input_filename property.
00244      * @li Then it creates a pointer to the csv_parser::input_fp property.
00245      *
00246      * @overload bool init(const char * input_filename)
00247      * @param[in] input_filename
00248      * @return bool Returns true on success and false on error.
00249      */
00250     bool init(const char * input_filename);
00251 
00252     /**
00253      * Defines the Field Enclosure character used in the Text File
00254      *
00255      * Setting this to NULL means that the enclosure character is optional.
00256      *
00257      * If the enclosure is optional, there could be fields that are enclosed, and fields that are not enclosed within the same line/record.
00258      *
00259      * @param[in] fields_enclosed_by The character used to enclose the fields.
00260      * @param[in] enclosure_mode How the CSV file should be parsed.
00261      * @return void
00262      */
00263     void set_enclosed_char(char fields_enclosed_by, enclosure_type_t enclosure_mode);
00264 
00265     /**
00266      * Defines the Field Delimiter character used in the text file
00267      *
00268      * @param[in] fields_terminated_by
00269      * @return void
00270      */
00271     void set_field_term_char(char fields_terminated_by);
00272 
00273     /**
00274      * Defines the Record Terminator character used in the text file
00275      *
00276      * @param[in] lines_terminated_by
00277      * @return void
00278      */
00279     void set_line_term_char(char lines_terminated_by);
00280 
00281     /**
00282      * Returns whether there is still more data
00283      *
00284      * This method returns a boolean value indicating whether or not there are
00285      * still more records to be extracted in the current file being parsed.
00286      *
00287      * Call this method to see if there are more rows to retrieve before invoking csv_parser::get_row()
00288      *
00289      * @see csv_parser::get_row()
00290      * @see csv_parser::more_rows
00291      *
00292      * @return bool Returns true if there are still more rows and false if there is not.
00293      */
00294     bool has_more_rows(void)
00295     {
00296         return more_rows;
00297     }
00298 
00299     /**
00300      * Defines the number of records to discard
00301      *
00302      * The number of records specified will be discarded during the parsing process.
00303      *
00304      * @see csv_parser::_skip_lines()
00305      * @see csv_parser::get_row()
00306      * @see csv_parser::has_more_rows()
00307      *
00308      * @param[in] lines_to_skip How many records should be skipped
00309      * @return void
00310      */
00311     void set_skip_lines(unsigned int lines_to_skip)
00312     {
00313         ignore_num_lines = lines_to_skip;
00314     }
00315 
00316     /**
00317      * Return the current row from the CSV file
00318      *
00319      * The row is returned as a vector of string objects.
00320      *
00321      * This method should be called only if csv_parser::has_more_rows() is true
00322      *
00323      * @see csv_parser::has_more_rows()
00324      * @see csv_parser::get_record_count()
00325      * @see csv_parser::reset_record_count()
00326      * @see csv_parser::more_rows
00327      *
00328      * @return csv_row A vector type containing an array of strings
00329      */
00330     csv_row get_row(void);
00331 
00332     /**
00333      * Returns the number of times the csv_parser::get_row() method has been invoked
00334      *
00335      * @see csv_parser::reset_record_count()
00336      * @return unsigned int The number of times the csv_parser::get_row() method has been invoked.
00337      */
00338     unsigned int get_record_count(void)
00339     {
00340         return record_count;
00341     }
00342 
00343     /**
00344      * Resets the record_count internal attribute to zero
00345      *
00346      * This may be used if the object is reused multiple times.
00347      *
00348      * @see csv_parser::record_count
00349      * @see csv_parser::get_record_count()
00350      * @return void
00351      */
00352     void reset_record_count(void)
00353     {
00354         record_count = 0U;
00355     }
00356 
00357 private :
00358 
00359     /**
00360      * Ignores N records in the CSV file
00361      *
00362      * Where N is the value of the csv_parser::ignore_num_lines internal property.
00363      *
00364      * The number of lines skipped can be defined by csv_parser::set_skip_lines()
00365      *
00366      * @see csv_parser::set_skip_lines()
00367      *
00368      * @return void
00369      */
00370     void _skip_lines(void);
00371 
00372     /**
00373      * Reads a Single Line
00374      *
00375      * Reads a single record into the buffer passed by reference to the method
00376      *
00377      * @param[in,out] buffer A pointer to a character array for the current line.
00378      * @param[out] buffer_len A pointer to an integer storing the length of the buffer.
00379      * @return void
00380      */
00381     void _read_single_line(char ** buffer, unsigned int * buffer_len);
00382 
00383     /**
00384      * Extracts the fields without enclosures
00385      *
00386      * This is used when the enclosure character is not set
00387      * @param[out] row The vector of strings
00388      * @param[in] line The character array buffer containing the current record/line
00389      * @param[in] line_length The length of the buffer
00390      */
00391     void _get_fields_without_enclosure(csv_row_ptr row, const char * line, const unsigned int * line_length);
00392 
00393     /**
00394      * Extracts the fields with enclosures
00395      *
00396      * This is used when the enclosure character is set.
00397      *
00398      * @param[out] row The vector of strings
00399      * @param[in] line The character array buffer containing the current record/line
00400      * @param[in] line_length The length of the buffer
00401      */
00402     void _get_fields_with_enclosure(csv_row_ptr row, const char * line, const unsigned int * line_length);
00403 
00404     /**
00405      * Extracts the fields when enclosure is optional
00406      *
00407      * This is used when the enclosure character is optional
00408      *
00409      * Hence, there could be fields that use it, and fields that don't.
00410      *
00411      * @param[out] row The vector of strings
00412      * @param[in] line The character array buffer containing the current record/line
00413      * @param[in] line_length The length of the buffer
00414      */
00415     void _get_fields_with_optional_enclosure(csv_row_ptr row, const char * line, const unsigned int * line_length);
00416 
00417 protected :
00418 
00419     /**
00420      * The enclosure character
00421      *
00422      * If present or used for a field it is assumed that both ends of the fields are wrapped.
00423      *
00424      * This is that single character used in the document to wrap the fields.
00425      *
00426      * @see csv_parser::_get_fields_without_enclosure()
00427      * @see csv_parser::_get_fields_with_enclosure()
00428      * @see csv_parser::_get_fields_with_optional_enclosure()
00429      *
00430      * @var enclosed_char
00431      */
00432     char enclosed_char;
00433 
00434     /**
00435      * The escape character
00436      *
00437      * For now the only valid escape character allowed is the backslash character 0x5C
00438      *
00439      * This is only important when the enclosure character is required or optional.
00440      *
00441      * This is the backslash character used to escape enclosure characters found within the fields.
00442      *
00443      * @see csv_parser::_get_fields_with_enclosure()
00444      * @see csv_parser::_get_fields_with_optional_enclosure()
00445      * @todo Update the code to accept other escape characters besides the backslash
00446      *
00447      * @var escaped_char
00448      */
00449     char escaped_char;
00450 
00451     /**
00452      * The field terminator
00453      *
00454      * This is the single character used to mark the end of a column in the text file.
00455      *
00456      * Common characters used include the comma, tab, and semi-colons.
00457      *
00458      * This is the single character used to separate fields within a record.
00459      *
00460      * @var field_term_char
00461      */
00462     char field_term_char;
00463 
00464     /**
00465      * The record terminator
00466      *
00467      * This is the single character used to mark the end of a record in the text file.
00468      *
00469      * The most popular one is the new line character however it is possible to use others as well.
00470      *
00471      * This is the single character used to mark the end of a record
00472      *
00473      * @see csv_parser::get_row()
00474      *
00475      * @var line_term_char
00476      */
00477     char line_term_char;
00478 
00479     /**
00480      * Enclosure length
00481      *
00482      * This is the length of the enclosure character
00483      *
00484      * @see csv_parser::csv_parser()
00485      * @see csv_parser::set_enclosed_char()
00486      *
00487      * @var enclosed_length
00488      */
00489     unsigned int enclosed_length;
00490 
00491     /**
00492      * The length of the escape character
00493      *
00494      * Right now this is really not being used.
00495      *
00496      * It may be used in future versions of the object.
00497      *
00498      * @todo Update the code to accept other escape characters besides the backslash
00499      *
00500      * @var escaped_length
00501      */
00502     unsigned int escaped_length;
00503 
00504     /**
00505      * Length of the field terminator
00506      *
00507      * For now this is not being used. It will be used in future versions of the object.
00508      *
00509      * @var field_term_length
00510      */
00511     unsigned int field_term_length;
00512 
00513     /**
00514      * Length of the record terminator
00515      *
00516      * For now this is not being used. It will be used in future versions of the object.
00517      *
00518      * @var line_term_length
00519      */
00520     unsigned int line_term_length;
00521 
00522     /**
00523      * Number of records to discard
00524      *
00525      * This variable controls how many records in the file are skipped before parsing begins.
00526      *
00527      * @see csv_parser::_skip_lines()
00528      * @see csv_parser::set_skip_lines()
00529      *
00530      * @var ignore_num_lines
00531      */
00532     unsigned int ignore_num_lines;
00533 
00534     /**
00535      * Number of times the get_row() method has been called
00536      *
00537      * @see csv_parser::get_row()
00538      * @var record_count
00539      */
00540     unsigned int record_count;
00541 
00542     /**
00543      * The CSV File Pointer
00544      *
00545      * This is the pointer to the CSV file
00546      *
00547      * @var input_fp
00548      */
00549     FILE * input_fp;
00550 
00551     /**
00552      * Buffer to input file name
00553      *
00554      * This buffer is used to store the name of the file that is being parsed
00555      *
00556      * @var input_filename
00557      */
00558     char * input_filename;
00559 
00560     /**
00561      * Mode in which the CSV file will be parsed
00562      *
00563      * The various values are explained below
00564      *
00565      * @li ENCLOSURE_NONE         (1) means the CSV file does not use any enclosure characters for the fields
00566      * @li ENCLOSURE_REQUIRED     (2) means the CSV file requires enclosure characters for all the fields
00567      * @li ENCLOSURE_OPTIONAL     (3) means the use of enclosure characters for the fields is optional
00568      *
00569      * @see csv_parser::get_row()
00570      * @see csv_parser::_read_single_line()
00571      * @see csv_parser::_get_fields_without_enclosure()
00572      * @see csv_parser::_get_fields_with_enclosure()
00573      * @see csv_parser::_get_fields_with_optional_enclosure()
00574      *
00575      * @var enclosure_type
00576      */
00577     enclosure_type_t enclosure_type;
00578 
00579     /**
00580      * There are still more records to parse
00581      *
00582      * This boolean property is an internal indicator of whether there are still records in the
00583      * file to be parsed.
00584      *
00585      * @see csv_parser::has_more_rows()
00586      * @var more_rows
00587      */
00588     bool more_rows;
00589 
00590 }; /* class csv_parser */
00591 
00592 #endif /* CSV_PARSER_HPP_INCLUDED */