Port of Israel Ekpo's CSV parser library

Dependents:   parser_sample IoTGateway_Basic

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers csv_parser.cpp Source File

csv_parser.cpp

00001 
00002 /* INCLUDING HEADER FILES */
00003 /*
00004 
00005 Copyright (c) 2008 - 2009, Israel Ekpo
00006 All rights reserved.
00007 
00008 Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
00009 
00010     * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
00011     * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
00012     * Neither the name of Israel Ekpo nor the names of contributors may be used to endorse or promote products derived from this software without specific prior written permission.
00013 
00014 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00015 
00016 
00017 */
00018 
00019 #include "csv_parser.h"
00020 
00021 
00022 /* BEGIN DEFINITION FOR PUBLIC METHODS */
00023 bool csv_parser::init (FILE * input_file_pointer)
00024 {
00025     input_fp = input_file_pointer;
00026 
00027     if (input_fp == NULL)
00028     {
00029         fprintf(stderr, "Fatal error : unable to open input file from file pointer\n");
00030 
00031         return false;
00032     }
00033 
00034     /* Resetting the internal pointer to the beginning of the stream */
00035     rewind(input_fp);
00036 
00037     more_rows = true;
00038 
00039     _skip_lines();
00040 
00041     return true;
00042 }
00043 
00044 bool csv_parser::init (const char * input_file)
00045 {
00046     const size_t filename_length = strlen(input_file);
00047 
00048     if (!filename_length)
00049     {
00050         fprintf(stderr, "Fatal error : invalid input file %s\n", input_file);
00051 
00052         return false;
00053     }
00054 
00055     input_filename = (char *) malloc(filename_length + 1);
00056 
00057     if (input_filename == NULL)
00058     {
00059         fprintf(stderr, "Fatal error : unable to allocate memory for file name buffer %s\n", input_file);
00060 
00061         return false;
00062     }
00063 
00064     memset(input_filename, 0, filename_length + 1);
00065 
00066     strcpy(input_filename, input_file);
00067 
00068     input_fp = fopen(input_file, "r");
00069 
00070     if (input_fp == NULL)
00071     {
00072         fprintf(stderr, "Fatal error : unable to open input file [%s]\n", input_file);
00073 
00074         CSV_PARSER_FREE_BUFFER_PTR(input_filename);
00075 
00076         return false;
00077     }
00078 
00079     more_rows = true;
00080 
00081     _skip_lines();
00082 
00083     return true;
00084 }
00085 
00086 void csv_parser::set_enclosed_char(char fields_enclosed_by, enclosure_type_t enclosure_mode)
00087 {
00088     if (fields_enclosed_by != 0)
00089     {
00090         enclosed_char   = fields_enclosed_by;
00091         enclosed_length = 1U;
00092         enclosure_type  = enclosure_mode;
00093     }
00094 }
00095 
00096 void csv_parser::set_field_term_char(char fields_terminated_by)
00097 {
00098     if (fields_terminated_by != 0)
00099     {
00100         field_term_char   = fields_terminated_by;
00101         field_term_length = 1U;
00102     }
00103 }
00104 
00105 void csv_parser::set_line_term_char(char lines_terminated_by)
00106 {
00107     if (lines_terminated_by != 0)
00108     {
00109         line_term_char   = lines_terminated_by;
00110         line_term_length = 1U;
00111     }
00112 }
00113 
00114 csv_row csv_parser::get_row(void)
00115 {
00116     csv_row current_row;
00117 
00118     /* This will store the length of the buffer */
00119     unsigned int line_length = 0U;
00120 
00121     /* Character array buffer for the current record */
00122     char * line = NULL;
00123 
00124     /* Grab one record */
00125     _read_single_line(&line, &line_length);
00126 
00127     /* Select the most suitable field extractor based on the enclosure length */
00128     switch(enclosure_type)
00129     {
00130         case ENCLOSURE_NONE :      /* The fields are not enclosed by any character */
00131             _get_fields_without_enclosure(&current_row, line, &line_length);
00132         break;
00133 
00134         case ENCLOSURE_REQUIRED : /* The fields are enclosed by a character */
00135             _get_fields_with_enclosure(&current_row, line, &line_length);
00136         break;
00137 
00138         case ENCLOSURE_OPTIONAL : /* The fields may or may not be enclosed */
00139             _get_fields_with_optional_enclosure(&current_row, line, &line_length);
00140         break;
00141 
00142         default :
00143             _get_fields_with_optional_enclosure(&current_row, line, &line_length);
00144         break;
00145     }
00146 
00147     /* Deallocate the current buffer */
00148     CSV_PARSER_FREE_BUFFER_PTR(line);
00149 
00150     /* Keeps track of how many times this has method has been called */
00151     record_count++;
00152 
00153     return current_row;
00154 }
00155 
00156 /* BEGIN DEFINITION FOR PROTECTED METHODS */
00157 
00158 
00159 /* BEGIN DEFINITION FOR PRIVATE METHODS */
00160 
00161 void csv_parser::_skip_lines(void)
00162 {
00163     /* Just in case the user accidentally sets ignore_num_lines to a negative number */
00164     unsigned int number_of_lines_to_ignore = abs((int) ignore_num_lines);
00165 
00166     while(has_more_rows() && number_of_lines_to_ignore)
00167     {
00168         const csv_row row = get_row();
00169 
00170         number_of_lines_to_ignore--;
00171     }
00172 
00173     record_count = 0U;
00174 }
00175 
00176 void csv_parser::_get_fields_without_enclosure(csv_row_ptr row, const char * line, const unsigned int * line_length)
00177 {
00178     char * field = NULL;
00179 
00180     if (*line_length > 0)
00181     {
00182         field = (char *) malloc(*line_length);
00183 
00184         memset(field, 0, *line_length);
00185 
00186         register unsigned int field_start   = 0U;
00187         register unsigned int field_end     = 0U;
00188         register unsigned int char_pos         = 0U;
00189 
00190         while(char_pos < *line_length)
00191         {
00192             char curr_char = line[char_pos];
00193 
00194             if (curr_char == field_term_char)
00195             {
00196                 field_end = char_pos;
00197 
00198                 const char * field_starts_at = line + field_start;
00199 
00200                 /* Field width must exclude field delimiter characters */
00201                 const unsigned int field_width = field_end - field_start;
00202 
00203                 /* Copy exactly field_width bytes from field_starts_at to field */
00204                 memcpy(field, field_starts_at, field_width);
00205 
00206                 /* This must be a null-terminated character array */
00207                 field[field_width] = 0x00;
00208 
00209                 string field_string_obj = field;
00210 
00211                 row->push_back(field_string_obj);
00212 
00213                 /* This is the starting point of the next field */
00214                 field_start = char_pos + 1;
00215 
00216             } else if (curr_char == line_term_char)
00217             {
00218                 field_end = char_pos;
00219 
00220                 const char * field_starts_at = line + field_start;
00221 
00222                 /* Field width must exclude line terminating characters */
00223                 const unsigned int field_width = field_end - field_start;
00224 
00225                 /* Copy exactly field_width bytes from field_starts_at to field */
00226                 memcpy(field, field_starts_at, field_width);
00227 
00228                 /* This must be a null-terminated character array */
00229                 field[field_width] = 0x00;
00230 
00231                 string field_string_obj = field;
00232 
00233                 row->push_back(field_string_obj);
00234             }
00235 
00236             /* Move to the next character in the current line */
00237             char_pos++;
00238         }
00239 
00240         /* Deallocate memory for field buffer */
00241         CSV_PARSER_FREE_BUFFER_PTR(field);
00242     }
00243 }
00244 
00245 void csv_parser::_get_fields_with_enclosure(csv_row_ptr row, const char * line, const unsigned int * line_length)
00246 {
00247     char * field = NULL;
00248 
00249     if (*line_length > 0)
00250     {
00251         field = (char *) malloc(*line_length);
00252 
00253         memset(field, 0, *line_length);
00254 
00255         register unsigned int current_state = 0U;
00256         register unsigned int field_start   = 0U;
00257         register unsigned int field_end     = 0U;
00258         register unsigned int char_pos         = 0U;
00259 
00260         while(char_pos < *line_length)
00261         {
00262             char curr_char = line[char_pos];
00263 
00264             if (curr_char == enclosed_char)
00265             {
00266                 current_state++;
00267 
00268                 /* Lets find out if the enclosure character encountered is
00269                  * a 'real' enclosure character or if it is an embedded character that
00270                  * has been escaped within the field.
00271                  */
00272                 register char previous_char = 0x00;
00273 
00274                 if (char_pos > 0U)
00275                 {
00276                     /* The escaped char will have to be the 2rd or later character. */
00277                     previous_char = line[char_pos - 1];
00278 
00279                     if (previous_char == escaped_char)
00280                     {
00281                         --current_state;
00282                     }
00283                 }
00284 
00285                 if (current_state == 1U && previous_char != escaped_char)
00286                 {
00287                     /* This marks the beginning of the column */
00288                     field_start = char_pos;
00289 
00290                 } else if (current_state == 2U)
00291                 {
00292                     /* We have found the end of the current field */
00293                     field_end = char_pos;
00294 
00295                     /* We do not need the enclosure characters */
00296                     const char * field_starts_at = line + field_start + 1U;
00297 
00298                     /* Field width must exclude beginning and ending enclosure characters */
00299                     const unsigned int field_width = field_end - field_start - 1U;
00300 
00301                     /* Copy exactly field_width bytes from field_starts_at to field */
00302                     memcpy(field, field_starts_at, field_width);
00303 
00304                     /* This must be a null-terminated character array */
00305                     field[field_width] = 0x00;
00306 
00307                     string field_string_obj = field;
00308 
00309                     row->push_back(field_string_obj);
00310 
00311                     /* Reset the state to zero value for the next field */
00312                     current_state = 0U;
00313                 }
00314             }
00315 
00316             /* Move to the next character in the current line */
00317             char_pos++;
00318         }
00319 
00320         /* If no enclosures were found in this line, the entire line becomes the only field. */
00321         if (0 == row->size())
00322         {
00323             string entire_line = line;
00324 
00325             row->push_back(entire_line);
00326 
00327         } else if (current_state == 1U)
00328         {
00329             /* The beginning enclosure character was found but
00330              * we could not locate the closing enclosure in the current line
00331              * So we need to copy the remainder of the line into the last field.
00332              */
00333 
00334             /* We do not need the starting enclosure character */
00335             const char * field_starts_at = line + field_start + 1U;
00336 
00337             /* Field width must exclude beginning characters */
00338             const unsigned int field_width = *line_length - field_start - 1U;
00339 
00340             /* Copy exactly field_width bytes from field_starts_at to field */
00341             memcpy(field, field_starts_at, field_width);
00342 
00343             /* This must be a null-terminated character array */
00344             field[field_width] = 0x00;
00345 
00346             string field_string_obj = field;
00347 
00348             row->push_back(field_string_obj);
00349         }
00350 
00351         /* Release the buffer for the field */
00352         CSV_PARSER_FREE_BUFFER_PTR(field);
00353     }
00354 }
00355 
00356 void csv_parser::_get_fields_with_optional_enclosure(csv_row_ptr row, const char * line, const unsigned int * line_length)
00357 {
00358     char * field = NULL;
00359 
00360     /*
00361      * How to extract the fields, when the enclosure char is optional.
00362      *
00363      * This is very similar to parsing the document without enclosure but with the following conditions.
00364      *
00365      * If the beginning char is an enclosure character, adjust the starting position of the string by + 1.
00366      * If the ending char is an enclosure character, adjust the ending position by -1
00367      */
00368     if (*line_length > 0)
00369     {
00370         field = (char *) malloc(*line_length);
00371 
00372         memset(field, 0, *line_length);
00373 
00374         register unsigned int field_start   = 0U;
00375         register unsigned int field_end     = 0U;
00376         register unsigned int char_pos         = 0U;
00377 
00378         while(char_pos < *line_length)
00379         {
00380             char curr_char = line[char_pos];
00381 
00382             if (curr_char == field_term_char)
00383             {
00384                 field_end = char_pos;
00385 
00386                 const char * field_starts_at = line + field_start;
00387 
00388                 /* Field width must exclude field delimiter characters */
00389                 unsigned int field_width = field_end - field_start;
00390 
00391                 const char line_first_char = field_starts_at[0];
00392                 const char line_final_char = field_starts_at[field_width - 1];
00393 
00394                 /* If the enclosure char is found at either ends of the string */
00395                 unsigned int first_adjustment = (line_first_char == enclosed_char) ? 1U : 0U;
00396                 unsigned int final_adjustment = (line_final_char == enclosed_char) ? 2U : 0U;
00397 
00398                 /* We do not want to have any negative or zero field widths */
00399                 field_width = (field_width > 2U) ? (field_width - final_adjustment) : field_width;
00400 
00401                 /* Copy exactly field_width bytes from field_starts_at to field */
00402                 memcpy(field, field_starts_at + first_adjustment, field_width);
00403 
00404                 /* This must be a null-terminated character array */
00405                 field[field_width] = 0x00;
00406 
00407                 string field_string_obj = field;
00408 
00409                 row->push_back(field_string_obj);
00410 
00411                 /* This is the starting point of the next field */
00412                 field_start = char_pos + 1;
00413 
00414             } else if (curr_char == line_term_char)
00415             {
00416                 field_end = char_pos;
00417 
00418                 const char * field_starts_at = line + field_start;
00419 
00420                 /* Field width must exclude line terminating characters */
00421                 unsigned int field_width = field_end - field_start;
00422 
00423                 const char line_first_char = field_starts_at[0];
00424                 const char line_final_char = field_starts_at[field_width - 1];
00425 
00426                 /* If the enclosure char is found at either ends of the string */
00427                 unsigned int first_adjustment = (line_first_char == enclosed_char) ? 1U : 0U;
00428                 unsigned int final_adjustment = (line_final_char == enclosed_char) ? 2U : 0U;
00429 
00430                 /* We do not want to have any negative or zero field widths */
00431                 field_width = (field_width > 2U) ? (field_width - final_adjustment) : field_width;
00432 
00433                 /* Copy exactly field_width bytes from field_starts_at to field */
00434                 memcpy(field, field_starts_at + first_adjustment, field_width);
00435 
00436                 /* This must be a null-terminated character array */
00437                 field[field_width] = 0x00;
00438 
00439                 string field_string_obj = field;
00440 
00441                 row->push_back(field_string_obj);
00442             }
00443 
00444             /* Move to the next character in the current line */
00445             char_pos++;
00446         }
00447 
00448         /* Deallocate memory for field buffer */
00449         CSV_PARSER_FREE_BUFFER_PTR(field);
00450     }
00451 }
00452 
00453 void csv_parser::_read_single_line(char ** buffer, unsigned int * buffer_len)
00454 {
00455     long int original_pos = ftell(input_fp);
00456     long int current_pos  = original_pos;
00457 
00458     register int current_char = 0;
00459 
00460     /* Checking one character at a time until the end of a line is found */
00461     while(true)
00462     {
00463         current_char = fgetc(input_fp);
00464 
00465         if (current_char == EOF)
00466         {
00467             /* We have reached the end of the file */
00468             more_rows = false;
00469 
00470             break;
00471 
00472         } else if (current_char == line_term_char)
00473         {
00474             /* We have reached the end of the row */
00475             current_pos++;
00476 
00477             break;
00478 
00479         } else {
00480 
00481             current_pos++;
00482         }
00483     }
00484 
00485     /* Let's try to peek one character ahead to see if we are at the end of the file */
00486     if (more_rows)
00487     {
00488         current_char = fgetc(input_fp);
00489 
00490         more_rows = (current_char == EOF) ? false : true;
00491     }
00492 
00493     /* Find out how long this row is */
00494     const size_t length_of_row = current_pos - original_pos;
00495 
00496     if (length_of_row > 0)
00497     {
00498         *buffer_len = length_of_row * sizeof(char) + 1;
00499 
00500         *buffer = (char *) realloc(*buffer, *buffer_len);
00501 
00502         memset(*buffer, 0, *buffer_len);
00503 
00504         /* Reset the internal pointer to the original position */
00505         fseek(input_fp, original_pos, SEEK_SET);
00506 
00507         /* Copy the contents of the line into the buffer */
00508         fread(*buffer, 1, length_of_row, input_fp);
00509     }
00510 }