Port of Israel Ekpo's CSV parser library
Dependents: parser_sample IoTGateway_Basic
csv_parser.h
/*

Copyright (c) 2008 - 2009, Israel Ekpo
All rights reserved.

Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
    * Neither the name of Israel Ekpo nor the names of contributors may be used to endorse or promote products derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


*/
/**
 * csv_parser Header File
 *
 * This object is used to parse text documents that are delimited by some
 * type of character. Some of the common ones use spaces, tabs, commas and semi-colons.
 *
 * This is a list of common characters encountered by this program
 *
 * This list was prepared from the data from http://www.asciitable.com
 *
 * @li DEC is how it would be represented in decimal form (base 10)
 * @li HEX is how it would be represented in hexadecimal format (base 16)
 *
 * @li DEC  HEX     Character Name
 * @li 0    0x00    null
 * @li 9    0x09    horizontal tab
 * @li 10   0x0A    line feed, new line
 * @li 13   0x0D    carriage return
 * @li 27   0x1B    escape
 * @li 32   0x20    space
 * @li 34   0x22    double quote
 * @li 39   0x27    single quote
 * @li 44   0x2C    comma
 * @li 92   0x5C    backslash
 *
 * @author Israel Ekpo <israel.ekpo@israelekpo.com>
 */

#ifndef CSV_PARSER_HPP_INCLUDED

#define CSV_PARSER_HPP_INCLUDED

#define LIBCSV_PARSER_MAJOR_VERSION 1

#define LIBCSV_PARSER_MINOR_VERSION 0

#define LIBCSV_PARSER_PATCH_VERSION 0

#define LIBCSV_PARSER_VERSION_NUMBER 10000

/* C++ header files */
#include <string>
#include <vector>

/* C header files */
#include <cstdio>
#include <cstring>
#include <cstdlib>

/*
 * A using-directive in a header leaks into every file that includes it.
 * It is kept only because existing code including this header may rely on
 * the unqualified names; the declarations below are explicitly qualified
 * and do not depend on it.
 */
using namespace std;

/**
 * @typedef csv_row
 *
 * Data structure used to represent a record.
 *
 * This is an alias for std::vector<std::string>
 */
typedef std::vector<std::string> csv_row;

/**
 * @typedef csv_row_ptr
 *
 * Pointer to a csv_row object
 *
 * Expands to std::vector<std::string> *
 */
typedef csv_row * csv_row_ptr;

/**
 * @typedef enclosure_type_t
 *
 * This enum type is used to set the mode in which the CSV file is parsed.
 *
 * @li ENCLOSURE_NONE     (1) means the CSV file does not use any enclosure characters for the fields
 * @li ENCLOSURE_REQUIRED (2) means the CSV file requires enclosure characters for all the fields
 * @li ENCLOSURE_OPTIONAL (3) means the use of enclosure characters for the fields is optional
 *
 * The ENCLOSURE_TYPE_BEGIN and ENCLOSURE_TYPE_END members of this enum definition are never to be used.
 */
typedef enum
{
    ENCLOSURE_TYPE_BEGIN = 0,
    ENCLOSURE_NONE       = 1,
    ENCLOSURE_REQUIRED   = 2,
    ENCLOSURE_OPTIONAL   = 3,
    ENCLOSURE_TYPE_END

} enclosure_type_t;

/**
 * @def CSV_PARSER_FREE_BUFFER_PTR(ptr)
 *
 * Used to deallocate buffer pointers
 *
 * It deallocates the pointer only if it is not null and then resets it to NULL.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement and can be used safely in an unbraced if/else.
 * The argument must be a modifiable lvalue; it is evaluated more than once.
 */
#define CSV_PARSER_FREE_BUFFER_PTR(ptr) \
    do                                  \
    {                                   \
        if ((ptr) != NULL)              \
        {                               \
            free(ptr);                  \
                                        \
            (ptr) = NULL;               \
        }                               \
    } while (0)

/**
 * @def CSV_PARSER_FREE_FILE_PTR(fptr)
 *
 * Used to close open file handles
 *
 * It closes the file only if it is not null and then resets it to NULL.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement and can be used safely in an unbraced if/else.
 * The argument must be a modifiable lvalue; it is evaluated more than once.
 */
#define CSV_PARSER_FREE_FILE_PTR(fptr)  \
    do                                  \
    {                                   \
        if ((fptr) != NULL)             \
        {                               \
            fclose(fptr);               \
                                        \
            (fptr) = NULL;              \
        }                               \
    } while (0)

/**
 * @class csv_parser
 *
 * The csv_parser object
 *
 * Used to parse text files to extract records and fields.
 *
 * We are making the following assumptions :
 *
 * @li The record terminator is only one character in length.
 * @li The field terminator is only one character in length.
 * @li The fields are enclosed by single characters, if any.
 *
 * @li The parser can handle documents where fields are always enclosed, not enclosed at all or optionally enclosed.
 * @li When fields are strictly all enclosed, there is an assumption that any enclosure characters within the field are escaped by placing a backslash in front of the enclosure character.
 *
 * The CSV files can be parsed in 3 modes.
 * @li (a) No enclosures
 * @li (b) Fields always enclosed.
 * @li (c) Fields optionally enclosed.
 *
 * For option (c) when the enclosure character is optional, if an enclosure character is spotted at either the beginning
 * or the end of the string, it is assumed that the field is enclosed.
 *
 * The csv_parser::init() method can accept a character array as the path to the CSV file.
 * Since it is overloaded, it can also accept a FILE pointer to a stream that is already open for reading.
 *
 * The set_enclosed_char() method accepts the field enclosure character as the first parameter and the enclosure mode as the second parameter which
 * controls how the text file is going to be parsed.
 *
 * @warning The class releases csv_parser::input_fp and csv_parser::input_filename in its destructor
 *          but declares no copy constructor or copy assignment operator. The compiler-generated ones
 *          copy both pointers member-wise, so destroying a copy and its original releases the same
 *          resources twice. Pass csv_parser objects by reference or pointer; do not copy an
 *          initialized parser.
 *
 * @see csv_parser::set_enclosed_char()
 * @see enclosure_type_t
 *
 * @todo Add ability to parse files where fields/columns are terminated by strings instead of just one char.
 * @todo Add ability to set strings where lines start by. Currently lines do not have any starting char or string.
 * @todo Add ability to set strings where line end by. Currently lines can only end with a single char.
 * @todo Add ability to accept other escape characters besides the backslash character 0x5C.
 * @todo More support for improperly formatted CSV data files.
 *
 * @author Israel Ekpo <israel.ekpo@israelekpo.com>
 */
class csv_parser
{

public :

    /**
     * Class constructor
     *
     * This is the default constructor.
     *
     * All the internal attributes are initialized here
     *
     * @li The enclosure character is initialized to NULL 0x00.
     * @li The escape character is initialized to the backslash character 0x5C.
     * @li The field delimiter character is initialized to a comma 0x2C.
     * @li The record delimiter character is initialized to a new line character 0x0A.
     *
     * @li The lengths of all the above-mentioned fields are initialized to 0,1,1 and 1 respectively.
     * @li The number of records to ignore is set to zero.
     * @li The record count is set to zero.
     * @li The enclosure mode is set to ENCLOSURE_NONE.
     * @li The more_rows internal attribute is set to false.
     * @li The pointer to the CSV input file is initialized to NULL
     * @li The pointer to the buffer for the file name is also initialized to NULL
     */
    csv_parser() : enclosed_char(0x00), escaped_char(0x5C),
                   field_term_char(0x2C), line_term_char(0x0A),
                   enclosed_length(0U), escaped_length(1U),
                   field_term_length(1U), line_term_length(1U),
                   ignore_num_lines(0U), record_count(0U),
                   input_fp(NULL), input_filename(NULL),
                   enclosure_type(ENCLOSURE_NONE),
                   more_rows(false)
    { }

    /**
     * Class destructor
     *
     * In the class destructor the file pointer to the input CSV file is closed and
     * the buffer to the input file name is also deallocated.
     *
     * NOTE(review): if init(FILE *) stores the caller's stream in input_fp, that
     * stream is closed here as well - confirm against the implementation file.
     *
     * @see csv_parser::input_fp
     * @see csv_parser::input_filename
     */
    ~csv_parser()
    {
        CSV_PARSER_FREE_FILE_PTR(input_fp);

        CSV_PARSER_FREE_BUFFER_PTR(input_filename);
    }

    /**
     * Initializes the current object
     *
     * This init method accepts a pointer to the CSV file that has been opened for reading
     *
     * It also resets the file pointer to the beginning of the stream
     *
     * @overload bool init(FILE * input_file_pointer)
     * @param[in] input_file_pointer
     * @return bool Returns true on success and false on error.
     */
    bool init(FILE * input_file_pointer);

    /**
     * Initializes the current object
     *
     * @li This init method accepts a character array as the path to the csv file.
     * @li It sets the value of the csv_parser::input_filename property.
     * @li Then it creates a pointer to the csv_parser::input_fp property.
     *
     * @overload bool init(const char * input_filename)
     * @param[in] input_filename
     * @return bool Returns true on success and false on error.
     */
    bool init(const char * input_filename);

    /**
     * Defines the Field Enclosure character used in the Text File
     *
     * Setting this to NULL means that the enclosure character is optional.
     *
     * If the enclosure is optional, there could be fields that are enclosed, and fields that are not enclosed within the same line/record.
     *
     * @param[in] fields_enclosed_by The character used to enclose the fields.
     * @param[in] enclosure_mode How the CSV file should be parsed.
     * @return void
     */
    void set_enclosed_char(char fields_enclosed_by, enclosure_type_t enclosure_mode);

    /**
     * Defines the Field Delimiter character used in the text file
     *
     * @param[in] fields_terminated_by
     * @return void
     */
    void set_field_term_char(char fields_terminated_by);

    /**
     * Defines the Record Terminator character used in the text file
     *
     * @param[in] lines_terminated_by
     * @return void
     */
    void set_line_term_char(char lines_terminated_by);

    /**
     * Returns whether there is still more data
     *
     * This method returns a boolean value indicating whether or not there are
     * still more records to be extracted in the current file being parsed.
     *
     * Call this method to see if there are more rows to retrieve before invoking csv_parser::get_row()
     *
     * @see csv_parser::get_row()
     * @see csv_parser::more_rows
     *
     * @return bool Returns true if there are still more rows and false if there is not.
     */
    bool has_more_rows(void) const
    {
        return more_rows;
    }

    /**
     * Defines the number of records to discard
     *
     * The number of records specified will be discarded during the parsing process.
     *
     * @see csv_parser::_skip_lines()
     * @see csv_parser::get_row()
     * @see csv_parser::has_more_rows()
     *
     * @param[in] lines_to_skip How many records should be skipped
     * @return void
     */
    void set_skip_lines(unsigned int lines_to_skip)
    {
        ignore_num_lines = lines_to_skip;
    }

    /**
     * Return the current row from the CSV file
     *
     * The row is returned as a vector of string objects.
     *
     * This method should be called only if csv_parser::has_more_rows() is true
     *
     * @see csv_parser::has_more_rows()
     * @see csv_parser::get_record_count()
     * @see csv_parser::reset_record_count()
     * @see csv_parser::more_rows
     *
     * @return csv_row A vector type containing an array of strings
     */
    csv_row get_row(void);

    /**
     * Returns the number of times the csv_parser::get_row() method has been invoked
     *
     * @see csv_parser::reset_record_count()
     * @return unsigned int The number of times the csv_parser::get_row() method has been invoked.
     */
    unsigned int get_record_count(void) const
    {
        return record_count;
    }

    /**
     * Resets the record_count internal attribute to zero
     *
     * This may be used if the object is reused multiple times.
     *
     * @see csv_parser::record_count
     * @see csv_parser::get_record_count()
     * @return void
     */
    void reset_record_count(void)
    {
        record_count = 0U;
    }

private :

    /**
     * Ignores N records in the CSV file
     *
     * Where N is the value of the csv_parser::ignore_num_lines internal property.
     *
     * The number of lines skipped can be defined by csv_parser::set_skip_lines()
     *
     * @see csv_parser::set_skip_lines()
     *
     * @return void
     */
    void _skip_lines(void);

    /**
     * Reads a Single Line
     *
     * Reads a single record into the buffer passed by reference to the method
     *
     * @param[in,out] buffer A pointer to a character array for the current line.
     * @param[out] buffer_len A pointer to an integer storing the length of the buffer.
     * @return void
     */
    void _read_single_line(char ** buffer, unsigned int * buffer_len);

    /**
     * Extracts the fields without enclosures
     *
     * This is used when the enclosure character is not set
     * @param[out] row The vector of strings
     * @param[in] line The character array buffer containing the current record/line
     * @param[in] line_length The length of the buffer
     */
    void _get_fields_without_enclosure(csv_row_ptr row, const char * line, const unsigned int * line_length);

    /**
     * Extracts the fields with enclosures
     *
     * This is used when the enclosure character is set.
     *
     * @param[out] row The vector of strings
     * @param[in] line The character array buffer containing the current record/line
     * @param[in] line_length The length of the buffer
     */
    void _get_fields_with_enclosure(csv_row_ptr row, const char * line, const unsigned int * line_length);

    /**
     * Extracts the fields when enclosure is optional
     *
     * This is used when the enclosure character is optional
     *
     * Hence, there could be fields that use it, and fields that don't.
     *
     * @param[out] row The vector of strings
     * @param[in] line The character array buffer containing the current record/line
     * @param[in] line_length The length of the buffer
     */
    void _get_fields_with_optional_enclosure(csv_row_ptr row, const char * line, const unsigned int * line_length);

protected :

    /**
     * The enclosure character
     *
     * If present or used for a field it is assumed that both ends of the fields are wrapped.
     *
     * This is that single character used in the document to wrap the fields.
     *
     * @see csv_parser::_get_fields_without_enclosure()
     * @see csv_parser::_get_fields_with_enclosure()
     * @see csv_parser::_get_fields_with_optional_enclosure()
     *
     * @var enclosed_char
     */
    char enclosed_char;

    /**
     * The escape character
     *
     * For now the only valid escape character allowed is the backslash character 0x5C
     *
     * This is only important when the enclosure character is required or optional.
     *
     * This is the backslash character used to escape enclosure characters found within the fields.
     *
     * @see csv_parser::_get_fields_with_enclosure()
     * @see csv_parser::_get_fields_with_optional_enclosure()
     * @todo Update the code to accept other escape characters besides the backslash
     *
     * @var escaped_char
     */
    char escaped_char;

    /**
     * The field terminator
     *
     * This is the single character used to mark the end of a column in the text file.
     *
     * Common characters used include the comma, tab, and semi-colons.
     *
     * This is the single character used to separate fields within a record.
     *
     * @var field_term_char
     */
    char field_term_char;

    /**
     * The record terminator
     *
     * This is the single character used to mark the end of a record in the text file.
     *
     * The most popular one is the new line character however it is possible to use others as well.
     *
     * This is the single character used to mark the end of a record
     *
     * @see csv_parser::get_row()
     *
     * @var line_term_char
     */
    char line_term_char;

    /**
     * Enclosure length
     *
     * This is the length of the enclosure character
     *
     * @see csv_parser::csv_parser()
     * @see csv_parser::set_enclosed_char()
     *
     * @var enclosed_length
     */
    unsigned int enclosed_length;

    /**
     * The length of the escape character
     *
     * Right now this is really not being used.
     *
     * It may be used in future versions of the object.
     *
     * @todo Update the code to accept other escape characters besides the backslash
     *
     * @var escaped_length
     */
    unsigned int escaped_length;

    /**
     * Length of the field terminator
     *
     * For now this is not being used. It will be used in future versions of the object.
     *
     * @var field_term_length
     */
    unsigned int field_term_length;

    /**
     * Length of the record terminator
     *
     * For now this is not being used. It will be used in future versions of the object.
     *
     * @var line_term_length
     */
    unsigned int line_term_length;

    /**
     * Number of records to discard
     *
     * This variable controls how many records in the file are skipped before parsing begins.
     *
     * @see csv_parser::_skip_lines()
     * @see csv_parser::set_skip_lines()
     *
     * @var ignore_num_lines
     */
    unsigned int ignore_num_lines;

    /**
     * Number of times the get_row() method has been called
     *
     * @see csv_parser::get_row()
     * @var record_count
     */
    unsigned int record_count;

    /**
     * The CSV File Pointer
     *
     * This is the pointer to the CSV file
     *
     * It is closed by the destructor when it is not NULL.
     *
     * @var input_fp
     */
    FILE * input_fp;

    /**
     * Buffer to input file name
     *
     * This buffer is used to store the name of the file that is being parsed
     *
     * It is released with free() by the destructor when it is not NULL.
     *
     * @var input_filename
     */
    char * input_filename;

    /**
     * Mode in which the CSV file will be parsed
     *
     * The various values are explained below
     *
     * @li ENCLOSURE_NONE     (1) means the CSV file does not use any enclosure characters for the fields
     * @li ENCLOSURE_REQUIRED (2) means the CSV file requires enclosure characters for all the fields
     * @li ENCLOSURE_OPTIONAL (3) means the use of enclosure characters for the fields is optional
     *
     * @see csv_parser::get_row()
     * @see csv_parser::_read_single_line()
     * @see csv_parser::_get_fields_without_enclosure()
     * @see csv_parser::_get_fields_with_enclosure()
     * @see csv_parser::_get_fields_with_optional_enclosure()
     *
     * @var enclosure_type
     */
    enclosure_type_t enclosure_type;

    /**
     * There are still more records to parse
     *
     * This boolean property is an internal indicator of whether there are still records in the
     * file to be parsed.
     *
     * @see csv_parser::has_more_rows()
     * @var more_rows
     */
    bool more_rows;

}; /* class csv_parser */

#endif /* CSV_PARSER_HPP_INCLUDED */
Generated on Fri Jul 22 2022 13:54:56 by 1.7.2