Port of Israel Ekpo's CSV parser library
Dependents: parser_sample IoTGateway_Basic
csv_parser.cpp
00001 00002 /* INCLUDING HEADER FILES */ 00003 /* 00004 00005 Copyright (c) 2008 - 2009, Israel Ekpo 00006 All rights reserved. 00007 00008 Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 00009 00010 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 00011 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 00012 * Neither the name of Israel Ekpo nor the names of contributors may be used to endorse or promote products derived from this software without specific prior written permission. 00013 00014 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
00015 00016 00017 */ 00018 00019 #include "csv_parser.h" 00020 00021 00022 /* BEGIN DEFINITION FOR PUBLIC METHODS */ 00023 bool csv_parser::init (FILE * input_file_pointer) 00024 { 00025 input_fp = input_file_pointer; 00026 00027 if (input_fp == NULL) 00028 { 00029 fprintf(stderr, "Fatal error : unable to open input file from file pointer\n"); 00030 00031 return false; 00032 } 00033 00034 /* Resetting the internal pointer to the beginning of the stream */ 00035 rewind(input_fp); 00036 00037 more_rows = true; 00038 00039 _skip_lines(); 00040 00041 return true; 00042 } 00043 00044 bool csv_parser::init (const char * input_file) 00045 { 00046 const size_t filename_length = strlen(input_file); 00047 00048 if (!filename_length) 00049 { 00050 fprintf(stderr, "Fatal error : invalid input file %s\n", input_file); 00051 00052 return false; 00053 } 00054 00055 input_filename = (char *) malloc(filename_length + 1); 00056 00057 if (input_filename == NULL) 00058 { 00059 fprintf(stderr, "Fatal error : unable to allocate memory for file name buffer %s\n", input_file); 00060 00061 return false; 00062 } 00063 00064 memset(input_filename, 0, filename_length + 1); 00065 00066 strcpy(input_filename, input_file); 00067 00068 input_fp = fopen(input_file, "r"); 00069 00070 if (input_fp == NULL) 00071 { 00072 fprintf(stderr, "Fatal error : unable to open input file [%s]\n", input_file); 00073 00074 CSV_PARSER_FREE_BUFFER_PTR(input_filename); 00075 00076 return false; 00077 } 00078 00079 more_rows = true; 00080 00081 _skip_lines(); 00082 00083 return true; 00084 } 00085 00086 void csv_parser::set_enclosed_char(char fields_enclosed_by, enclosure_type_t enclosure_mode) 00087 { 00088 if (fields_enclosed_by != 0) 00089 { 00090 enclosed_char = fields_enclosed_by; 00091 enclosed_length = 1U; 00092 enclosure_type = enclosure_mode; 00093 } 00094 } 00095 00096 void csv_parser::set_field_term_char(char fields_terminated_by) 00097 { 00098 if (fields_terminated_by != 0) 00099 { 00100 field_term_char = 
fields_terminated_by; 00101 field_term_length = 1U; 00102 } 00103 } 00104 00105 void csv_parser::set_line_term_char(char lines_terminated_by) 00106 { 00107 if (lines_terminated_by != 0) 00108 { 00109 line_term_char = lines_terminated_by; 00110 line_term_length = 1U; 00111 } 00112 } 00113 00114 csv_row csv_parser::get_row(void) 00115 { 00116 csv_row current_row; 00117 00118 /* This will store the length of the buffer */ 00119 unsigned int line_length = 0U; 00120 00121 /* Character array buffer for the current record */ 00122 char * line = NULL; 00123 00124 /* Grab one record */ 00125 _read_single_line(&line, &line_length); 00126 00127 /* Select the most suitable field extractor based on the enclosure length */ 00128 switch(enclosure_type) 00129 { 00130 case ENCLOSURE_NONE : /* The fields are not enclosed by any character */ 00131 _get_fields_without_enclosure(¤t_row, line, &line_length); 00132 break; 00133 00134 case ENCLOSURE_REQUIRED : /* The fields are enclosed by a character */ 00135 _get_fields_with_enclosure(¤t_row, line, &line_length); 00136 break; 00137 00138 case ENCLOSURE_OPTIONAL : /* The fields may or may not be enclosed */ 00139 _get_fields_with_optional_enclosure(¤t_row, line, &line_length); 00140 break; 00141 00142 default : 00143 _get_fields_with_optional_enclosure(¤t_row, line, &line_length); 00144 break; 00145 } 00146 00147 /* Deallocate the current buffer */ 00148 CSV_PARSER_FREE_BUFFER_PTR(line); 00149 00150 /* Keeps track of how many times this has method has been called */ 00151 record_count++; 00152 00153 return current_row; 00154 } 00155 00156 /* BEGIN DEFINITION FOR PROTECTED METHODS */ 00157 00158 00159 /* BEGIN DEFINITION FOR PRIVATE METHODS */ 00160 00161 void csv_parser::_skip_lines(void) 00162 { 00163 /* Just in case the user accidentally sets ignore_num_lines to a negative number */ 00164 unsigned int number_of_lines_to_ignore = abs((int) ignore_num_lines); 00165 00166 while(has_more_rows() && number_of_lines_to_ignore) 00167 { 00168 
const csv_row row = get_row(); 00169 00170 number_of_lines_to_ignore--; 00171 } 00172 00173 record_count = 0U; 00174 } 00175 00176 void csv_parser::_get_fields_without_enclosure(csv_row_ptr row, const char * line, const unsigned int * line_length) 00177 { 00178 char * field = NULL; 00179 00180 if (*line_length > 0) 00181 { 00182 field = (char *) malloc(*line_length); 00183 00184 memset(field, 0, *line_length); 00185 00186 register unsigned int field_start = 0U; 00187 register unsigned int field_end = 0U; 00188 register unsigned int char_pos = 0U; 00189 00190 while(char_pos < *line_length) 00191 { 00192 char curr_char = line[char_pos]; 00193 00194 if (curr_char == field_term_char) 00195 { 00196 field_end = char_pos; 00197 00198 const char * field_starts_at = line + field_start; 00199 00200 /* Field width must exclude field delimiter characters */ 00201 const unsigned int field_width = field_end - field_start; 00202 00203 /* Copy exactly field_width bytes from field_starts_at to field */ 00204 memcpy(field, field_starts_at, field_width); 00205 00206 /* This must be a null-terminated character array */ 00207 field[field_width] = 0x00; 00208 00209 string field_string_obj = field; 00210 00211 row->push_back(field_string_obj); 00212 00213 /* This is the starting point of the next field */ 00214 field_start = char_pos + 1; 00215 00216 } else if (curr_char == line_term_char) 00217 { 00218 field_end = char_pos; 00219 00220 const char * field_starts_at = line + field_start; 00221 00222 /* Field width must exclude line terminating characters */ 00223 const unsigned int field_width = field_end - field_start; 00224 00225 /* Copy exactly field_width bytes from field_starts_at to field */ 00226 memcpy(field, field_starts_at, field_width); 00227 00228 /* This must be a null-terminated character array */ 00229 field[field_width] = 0x00; 00230 00231 string field_string_obj = field; 00232 00233 row->push_back(field_string_obj); 00234 } 00235 00236 /* Move to the next character in the 
current line */ 00237 char_pos++; 00238 } 00239 00240 /* Deallocate memory for field buffer */ 00241 CSV_PARSER_FREE_BUFFER_PTR(field); 00242 } 00243 } 00244 00245 void csv_parser::_get_fields_with_enclosure(csv_row_ptr row, const char * line, const unsigned int * line_length) 00246 { 00247 char * field = NULL; 00248 00249 if (*line_length > 0) 00250 { 00251 field = (char *) malloc(*line_length); 00252 00253 memset(field, 0, *line_length); 00254 00255 register unsigned int current_state = 0U; 00256 register unsigned int field_start = 0U; 00257 register unsigned int field_end = 0U; 00258 register unsigned int char_pos = 0U; 00259 00260 while(char_pos < *line_length) 00261 { 00262 char curr_char = line[char_pos]; 00263 00264 if (curr_char == enclosed_char) 00265 { 00266 current_state++; 00267 00268 /* Lets find out if the enclosure character encountered is 00269 * a 'real' enclosure character or if it is an embedded character that 00270 * has been escaped within the field. 00271 */ 00272 register char previous_char = 0x00; 00273 00274 if (char_pos > 0U) 00275 { 00276 /* The escaped char will have to be the 2rd or later character. 
*/ 00277 previous_char = line[char_pos - 1]; 00278 00279 if (previous_char == escaped_char) 00280 { 00281 --current_state; 00282 } 00283 } 00284 00285 if (current_state == 1U && previous_char != escaped_char) 00286 { 00287 /* This marks the beginning of the column */ 00288 field_start = char_pos; 00289 00290 } else if (current_state == 2U) 00291 { 00292 /* We have found the end of the current field */ 00293 field_end = char_pos; 00294 00295 /* We do not need the enclosure characters */ 00296 const char * field_starts_at = line + field_start + 1U; 00297 00298 /* Field width must exclude beginning and ending enclosure characters */ 00299 const unsigned int field_width = field_end - field_start - 1U; 00300 00301 /* Copy exactly field_width bytes from field_starts_at to field */ 00302 memcpy(field, field_starts_at, field_width); 00303 00304 /* This must be a null-terminated character array */ 00305 field[field_width] = 0x00; 00306 00307 string field_string_obj = field; 00308 00309 row->push_back(field_string_obj); 00310 00311 /* Reset the state to zero value for the next field */ 00312 current_state = 0U; 00313 } 00314 } 00315 00316 /* Move to the next character in the current line */ 00317 char_pos++; 00318 } 00319 00320 /* If no enclosures were found in this line, the entire line becomes the only field. */ 00321 if (0 == row->size()) 00322 { 00323 string entire_line = line; 00324 00325 row->push_back(entire_line); 00326 00327 } else if (current_state == 1U) 00328 { 00329 /* The beginning enclosure character was found but 00330 * we could not locate the closing enclosure in the current line 00331 * So we need to copy the remainder of the line into the last field. 
00332 */ 00333 00334 /* We do not need the starting enclosure character */ 00335 const char * field_starts_at = line + field_start + 1U; 00336 00337 /* Field width must exclude beginning characters */ 00338 const unsigned int field_width = *line_length - field_start - 1U; 00339 00340 /* Copy exactly field_width bytes from field_starts_at to field */ 00341 memcpy(field, field_starts_at, field_width); 00342 00343 /* This must be a null-terminated character array */ 00344 field[field_width] = 0x00; 00345 00346 string field_string_obj = field; 00347 00348 row->push_back(field_string_obj); 00349 } 00350 00351 /* Release the buffer for the field */ 00352 CSV_PARSER_FREE_BUFFER_PTR(field); 00353 } 00354 } 00355 00356 void csv_parser::_get_fields_with_optional_enclosure(csv_row_ptr row, const char * line, const unsigned int * line_length) 00357 { 00358 char * field = NULL; 00359 00360 /* 00361 * How to extract the fields, when the enclosure char is optional. 00362 * 00363 * This is very similar to parsing the document without enclosure but with the following conditions. 00364 * 00365 * If the beginning char is an enclosure character, adjust the starting position of the string by + 1. 
00366 * If the ending char is an enclosure character, adjust the ending position by -1 00367 */ 00368 if (*line_length > 0) 00369 { 00370 field = (char *) malloc(*line_length); 00371 00372 memset(field, 0, *line_length); 00373 00374 register unsigned int field_start = 0U; 00375 register unsigned int field_end = 0U; 00376 register unsigned int char_pos = 0U; 00377 00378 while(char_pos < *line_length) 00379 { 00380 char curr_char = line[char_pos]; 00381 00382 if (curr_char == field_term_char) 00383 { 00384 field_end = char_pos; 00385 00386 const char * field_starts_at = line + field_start; 00387 00388 /* Field width must exclude field delimiter characters */ 00389 unsigned int field_width = field_end - field_start; 00390 00391 const char line_first_char = field_starts_at[0]; 00392 const char line_final_char = field_starts_at[field_width - 1]; 00393 00394 /* If the enclosure char is found at either ends of the string */ 00395 unsigned int first_adjustment = (line_first_char == enclosed_char) ? 1U : 0U; 00396 unsigned int final_adjustment = (line_final_char == enclosed_char) ? 2U : 0U; 00397 00398 /* We do not want to have any negative or zero field widths */ 00399 field_width = (field_width > 2U) ? 
(field_width - final_adjustment) : field_width; 00400 00401 /* Copy exactly field_width bytes from field_starts_at to field */ 00402 memcpy(field, field_starts_at + first_adjustment, field_width); 00403 00404 /* This must be a null-terminated character array */ 00405 field[field_width] = 0x00; 00406 00407 string field_string_obj = field; 00408 00409 row->push_back(field_string_obj); 00410 00411 /* This is the starting point of the next field */ 00412 field_start = char_pos + 1; 00413 00414 } else if (curr_char == line_term_char) 00415 { 00416 field_end = char_pos; 00417 00418 const char * field_starts_at = line + field_start; 00419 00420 /* Field width must exclude line terminating characters */ 00421 unsigned int field_width = field_end - field_start; 00422 00423 const char line_first_char = field_starts_at[0]; 00424 const char line_final_char = field_starts_at[field_width - 1]; 00425 00426 /* If the enclosure char is found at either ends of the string */ 00427 unsigned int first_adjustment = (line_first_char == enclosed_char) ? 1U : 0U; 00428 unsigned int final_adjustment = (line_final_char == enclosed_char) ? 2U : 0U; 00429 00430 /* We do not want to have any negative or zero field widths */ 00431 field_width = (field_width > 2U) ? 
(field_width - final_adjustment) : field_width; 00432 00433 /* Copy exactly field_width bytes from field_starts_at to field */ 00434 memcpy(field, field_starts_at + first_adjustment, field_width); 00435 00436 /* This must be a null-terminated character array */ 00437 field[field_width] = 0x00; 00438 00439 string field_string_obj = field; 00440 00441 row->push_back(field_string_obj); 00442 } 00443 00444 /* Move to the next character in the current line */ 00445 char_pos++; 00446 } 00447 00448 /* Deallocate memory for field buffer */ 00449 CSV_PARSER_FREE_BUFFER_PTR(field); 00450 } 00451 } 00452 00453 void csv_parser::_read_single_line(char ** buffer, unsigned int * buffer_len) 00454 { 00455 long int original_pos = ftell(input_fp); 00456 long int current_pos = original_pos; 00457 00458 register int current_char = 0; 00459 00460 /* Checking one character at a time until the end of a line is found */ 00461 while(true) 00462 { 00463 current_char = fgetc(input_fp); 00464 00465 if (current_char == EOF) 00466 { 00467 /* We have reached the end of the file */ 00468 more_rows = false; 00469 00470 break; 00471 00472 } else if (current_char == line_term_char) 00473 { 00474 /* We have reached the end of the row */ 00475 current_pos++; 00476 00477 break; 00478 00479 } else { 00480 00481 current_pos++; 00482 } 00483 } 00484 00485 /* Let's try to peek one character ahead to see if we are at the end of the file */ 00486 if (more_rows) 00487 { 00488 current_char = fgetc(input_fp); 00489 00490 more_rows = (current_char == EOF) ? 
false : true; 00491 } 00492 00493 /* Find out how long this row is */ 00494 const size_t length_of_row = current_pos - original_pos; 00495 00496 if (length_of_row > 0) 00497 { 00498 *buffer_len = length_of_row * sizeof(char) + 1; 00499 00500 *buffer = (char *) realloc(*buffer, *buffer_len); 00501 00502 memset(*buffer, 0, *buffer_len); 00503 00504 /* Reset the internal pointer to the original position */ 00505 fseek(input_fp, original_pos, SEEK_SET); 00506 00507 /* Copy the contents of the line into the buffer */ 00508 fread(*buffer, 1, length_of_row, input_fp); 00509 } 00510 }
Generated on Fri Jul 22 2022 13:54:56 by 1.7.2