Port of Israel Ekpo's CSV parser library

Dependents:   parser_sample IoTGateway_Basic

Committer:
hlipka
Date:
Mon Jan 24 23:08:37 2011 +0000
Revision:
0:7c9aa931c67c
initial version

Who changed what in which revision?

User | Revision | Line number | New contents of line
hlipka 0:7c9aa931c67c 1
hlipka 0:7c9aa931c67c 2 /* INCLUDING HEADER FILES */
hlipka 0:7c9aa931c67c 3 /*
hlipka 0:7c9aa931c67c 4
hlipka 0:7c9aa931c67c 5 Copyright (c) 2008 - 2009, Israel Ekpo
hlipka 0:7c9aa931c67c 6 All rights reserved.
hlipka 0:7c9aa931c67c 7
hlipka 0:7c9aa931c67c 8 Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
hlipka 0:7c9aa931c67c 9
hlipka 0:7c9aa931c67c 10 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
hlipka 0:7c9aa931c67c 11 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
hlipka 0:7c9aa931c67c 12 * Neither the name of Israel Ekpo nor the names of contributors may be used to endorse or promote products derived from this software without specific prior written permission.
hlipka 0:7c9aa931c67c 13
hlipka 0:7c9aa931c67c 14 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
hlipka 0:7c9aa931c67c 15
hlipka 0:7c9aa931c67c 16
hlipka 0:7c9aa931c67c 17 */
hlipka 0:7c9aa931c67c 18
hlipka 0:7c9aa931c67c 19 #include "csv_parser.h"
hlipka 0:7c9aa931c67c 20
hlipka 0:7c9aa931c67c 21
hlipka 0:7c9aa931c67c 22 /* BEGIN DEFINITION FOR PUBLIC METHODS */
hlipka 0:7c9aa931c67c 23 bool csv_parser::init(FILE * input_file_pointer)
hlipka 0:7c9aa931c67c 24 {
hlipka 0:7c9aa931c67c 25 input_fp = input_file_pointer;
hlipka 0:7c9aa931c67c 26
hlipka 0:7c9aa931c67c 27 if (input_fp == NULL)
hlipka 0:7c9aa931c67c 28 {
hlipka 0:7c9aa931c67c 29 fprintf(stderr, "Fatal error : unable to open input file from file pointer\n");
hlipka 0:7c9aa931c67c 30
hlipka 0:7c9aa931c67c 31 return false;
hlipka 0:7c9aa931c67c 32 }
hlipka 0:7c9aa931c67c 33
hlipka 0:7c9aa931c67c 34 /* Resetting the internal pointer to the beginning of the stream */
hlipka 0:7c9aa931c67c 35 rewind(input_fp);
hlipka 0:7c9aa931c67c 36
hlipka 0:7c9aa931c67c 37 more_rows = true;
hlipka 0:7c9aa931c67c 38
hlipka 0:7c9aa931c67c 39 _skip_lines();
hlipka 0:7c9aa931c67c 40
hlipka 0:7c9aa931c67c 41 return true;
hlipka 0:7c9aa931c67c 42 }
hlipka 0:7c9aa931c67c 43
hlipka 0:7c9aa931c67c 44 bool csv_parser::init(const char * input_file)
hlipka 0:7c9aa931c67c 45 {
hlipka 0:7c9aa931c67c 46 const size_t filename_length = strlen(input_file);
hlipka 0:7c9aa931c67c 47
hlipka 0:7c9aa931c67c 48 if (!filename_length)
hlipka 0:7c9aa931c67c 49 {
hlipka 0:7c9aa931c67c 50 fprintf(stderr, "Fatal error : invalid input file %s\n", input_file);
hlipka 0:7c9aa931c67c 51
hlipka 0:7c9aa931c67c 52 return false;
hlipka 0:7c9aa931c67c 53 }
hlipka 0:7c9aa931c67c 54
hlipka 0:7c9aa931c67c 55 input_filename = (char *) malloc(filename_length + 1);
hlipka 0:7c9aa931c67c 56
hlipka 0:7c9aa931c67c 57 if (input_filename == NULL)
hlipka 0:7c9aa931c67c 58 {
hlipka 0:7c9aa931c67c 59 fprintf(stderr, "Fatal error : unable to allocate memory for file name buffer %s\n", input_file);
hlipka 0:7c9aa931c67c 60
hlipka 0:7c9aa931c67c 61 return false;
hlipka 0:7c9aa931c67c 62 }
hlipka 0:7c9aa931c67c 63
hlipka 0:7c9aa931c67c 64 memset(input_filename, 0, filename_length + 1);
hlipka 0:7c9aa931c67c 65
hlipka 0:7c9aa931c67c 66 strcpy(input_filename, input_file);
hlipka 0:7c9aa931c67c 67
hlipka 0:7c9aa931c67c 68 input_fp = fopen(input_file, "r");
hlipka 0:7c9aa931c67c 69
hlipka 0:7c9aa931c67c 70 if (input_fp == NULL)
hlipka 0:7c9aa931c67c 71 {
hlipka 0:7c9aa931c67c 72 fprintf(stderr, "Fatal error : unable to open input file [%s]\n", input_file);
hlipka 0:7c9aa931c67c 73
hlipka 0:7c9aa931c67c 74 CSV_PARSER_FREE_BUFFER_PTR(input_filename);
hlipka 0:7c9aa931c67c 75
hlipka 0:7c9aa931c67c 76 return false;
hlipka 0:7c9aa931c67c 77 }
hlipka 0:7c9aa931c67c 78
hlipka 0:7c9aa931c67c 79 more_rows = true;
hlipka 0:7c9aa931c67c 80
hlipka 0:7c9aa931c67c 81 _skip_lines();
hlipka 0:7c9aa931c67c 82
hlipka 0:7c9aa931c67c 83 return true;
hlipka 0:7c9aa931c67c 84 }
hlipka 0:7c9aa931c67c 85
hlipka 0:7c9aa931c67c 86 void csv_parser::set_enclosed_char(char fields_enclosed_by, enclosure_type_t enclosure_mode)
hlipka 0:7c9aa931c67c 87 {
hlipka 0:7c9aa931c67c 88 if (fields_enclosed_by != 0)
hlipka 0:7c9aa931c67c 89 {
hlipka 0:7c9aa931c67c 90 enclosed_char = fields_enclosed_by;
hlipka 0:7c9aa931c67c 91 enclosed_length = 1U;
hlipka 0:7c9aa931c67c 92 enclosure_type = enclosure_mode;
hlipka 0:7c9aa931c67c 93 }
hlipka 0:7c9aa931c67c 94 }
hlipka 0:7c9aa931c67c 95
hlipka 0:7c9aa931c67c 96 void csv_parser::set_field_term_char(char fields_terminated_by)
hlipka 0:7c9aa931c67c 97 {
hlipka 0:7c9aa931c67c 98 if (fields_terminated_by != 0)
hlipka 0:7c9aa931c67c 99 {
hlipka 0:7c9aa931c67c 100 field_term_char = fields_terminated_by;
hlipka 0:7c9aa931c67c 101 field_term_length = 1U;
hlipka 0:7c9aa931c67c 102 }
hlipka 0:7c9aa931c67c 103 }
hlipka 0:7c9aa931c67c 104
hlipka 0:7c9aa931c67c 105 void csv_parser::set_line_term_char(char lines_terminated_by)
hlipka 0:7c9aa931c67c 106 {
hlipka 0:7c9aa931c67c 107 if (lines_terminated_by != 0)
hlipka 0:7c9aa931c67c 108 {
hlipka 0:7c9aa931c67c 109 line_term_char = lines_terminated_by;
hlipka 0:7c9aa931c67c 110 line_term_length = 1U;
hlipka 0:7c9aa931c67c 111 }
hlipka 0:7c9aa931c67c 112 }
hlipka 0:7c9aa931c67c 113
hlipka 0:7c9aa931c67c 114 csv_row csv_parser::get_row(void)
hlipka 0:7c9aa931c67c 115 {
hlipka 0:7c9aa931c67c 116 csv_row current_row;
hlipka 0:7c9aa931c67c 117
hlipka 0:7c9aa931c67c 118 /* This will store the length of the buffer */
hlipka 0:7c9aa931c67c 119 unsigned int line_length = 0U;
hlipka 0:7c9aa931c67c 120
hlipka 0:7c9aa931c67c 121 /* Character array buffer for the current record */
hlipka 0:7c9aa931c67c 122 char * line = NULL;
hlipka 0:7c9aa931c67c 123
hlipka 0:7c9aa931c67c 124 /* Grab one record */
hlipka 0:7c9aa931c67c 125 _read_single_line(&line, &line_length);
hlipka 0:7c9aa931c67c 126
hlipka 0:7c9aa931c67c 127 /* Select the most suitable field extractor based on the enclosure length */
hlipka 0:7c9aa931c67c 128 switch(enclosure_type)
hlipka 0:7c9aa931c67c 129 {
hlipka 0:7c9aa931c67c 130 case ENCLOSURE_NONE : /* The fields are not enclosed by any character */
hlipka 0:7c9aa931c67c 131 _get_fields_without_enclosure(&current_row, line, &line_length);
hlipka 0:7c9aa931c67c 132 break;
hlipka 0:7c9aa931c67c 133
hlipka 0:7c9aa931c67c 134 case ENCLOSURE_REQUIRED : /* The fields are enclosed by a character */
hlipka 0:7c9aa931c67c 135 _get_fields_with_enclosure(&current_row, line, &line_length);
hlipka 0:7c9aa931c67c 136 break;
hlipka 0:7c9aa931c67c 137
hlipka 0:7c9aa931c67c 138 case ENCLOSURE_OPTIONAL : /* The fields may or may not be enclosed */
hlipka 0:7c9aa931c67c 139 _get_fields_with_optional_enclosure(&current_row, line, &line_length);
hlipka 0:7c9aa931c67c 140 break;
hlipka 0:7c9aa931c67c 141
hlipka 0:7c9aa931c67c 142 default :
hlipka 0:7c9aa931c67c 143 _get_fields_with_optional_enclosure(&current_row, line, &line_length);
hlipka 0:7c9aa931c67c 144 break;
hlipka 0:7c9aa931c67c 145 }
hlipka 0:7c9aa931c67c 146
hlipka 0:7c9aa931c67c 147 /* Deallocate the current buffer */
hlipka 0:7c9aa931c67c 148 CSV_PARSER_FREE_BUFFER_PTR(line);
hlipka 0:7c9aa931c67c 149
hlipka 0:7c9aa931c67c 150 /* Keeps track of how many times this has method has been called */
hlipka 0:7c9aa931c67c 151 record_count++;
hlipka 0:7c9aa931c67c 152
hlipka 0:7c9aa931c67c 153 return current_row;
hlipka 0:7c9aa931c67c 154 }
hlipka 0:7c9aa931c67c 155
hlipka 0:7c9aa931c67c 156 /* BEGIN DEFINITION FOR PROTECTED METHODS */
hlipka 0:7c9aa931c67c 157
hlipka 0:7c9aa931c67c 158
hlipka 0:7c9aa931c67c 159 /* BEGIN DEFINITION FOR PRIVATE METHODS */
hlipka 0:7c9aa931c67c 160
hlipka 0:7c9aa931c67c 161 void csv_parser::_skip_lines(void)
hlipka 0:7c9aa931c67c 162 {
hlipka 0:7c9aa931c67c 163 /* Just in case the user accidentally sets ignore_num_lines to a negative number */
hlipka 0:7c9aa931c67c 164 unsigned int number_of_lines_to_ignore = abs((int) ignore_num_lines);
hlipka 0:7c9aa931c67c 165
hlipka 0:7c9aa931c67c 166 while(has_more_rows() && number_of_lines_to_ignore)
hlipka 0:7c9aa931c67c 167 {
hlipka 0:7c9aa931c67c 168 const csv_row row = get_row();
hlipka 0:7c9aa931c67c 169
hlipka 0:7c9aa931c67c 170 number_of_lines_to_ignore--;
hlipka 0:7c9aa931c67c 171 }
hlipka 0:7c9aa931c67c 172
hlipka 0:7c9aa931c67c 173 record_count = 0U;
hlipka 0:7c9aa931c67c 174 }
hlipka 0:7c9aa931c67c 175
hlipka 0:7c9aa931c67c 176 void csv_parser::_get_fields_without_enclosure(csv_row_ptr row, const char * line, const unsigned int * line_length)
hlipka 0:7c9aa931c67c 177 {
hlipka 0:7c9aa931c67c 178 char * field = NULL;
hlipka 0:7c9aa931c67c 179
hlipka 0:7c9aa931c67c 180 if (*line_length > 0)
hlipka 0:7c9aa931c67c 181 {
hlipka 0:7c9aa931c67c 182 field = (char *) malloc(*line_length);
hlipka 0:7c9aa931c67c 183
hlipka 0:7c9aa931c67c 184 memset(field, 0, *line_length);
hlipka 0:7c9aa931c67c 185
hlipka 0:7c9aa931c67c 186 register unsigned int field_start = 0U;
hlipka 0:7c9aa931c67c 187 register unsigned int field_end = 0U;
hlipka 0:7c9aa931c67c 188 register unsigned int char_pos = 0U;
hlipka 0:7c9aa931c67c 189
hlipka 0:7c9aa931c67c 190 while(char_pos < *line_length)
hlipka 0:7c9aa931c67c 191 {
hlipka 0:7c9aa931c67c 192 char curr_char = line[char_pos];
hlipka 0:7c9aa931c67c 193
hlipka 0:7c9aa931c67c 194 if (curr_char == field_term_char)
hlipka 0:7c9aa931c67c 195 {
hlipka 0:7c9aa931c67c 196 field_end = char_pos;
hlipka 0:7c9aa931c67c 197
hlipka 0:7c9aa931c67c 198 const char * field_starts_at = line + field_start;
hlipka 0:7c9aa931c67c 199
hlipka 0:7c9aa931c67c 200 /* Field width must exclude field delimiter characters */
hlipka 0:7c9aa931c67c 201 const unsigned int field_width = field_end - field_start;
hlipka 0:7c9aa931c67c 202
hlipka 0:7c9aa931c67c 203 /* Copy exactly field_width bytes from field_starts_at to field */
hlipka 0:7c9aa931c67c 204 memcpy(field, field_starts_at, field_width);
hlipka 0:7c9aa931c67c 205
hlipka 0:7c9aa931c67c 206 /* This must be a null-terminated character array */
hlipka 0:7c9aa931c67c 207 field[field_width] = 0x00;
hlipka 0:7c9aa931c67c 208
hlipka 0:7c9aa931c67c 209 string field_string_obj = field;
hlipka 0:7c9aa931c67c 210
hlipka 0:7c9aa931c67c 211 row->push_back(field_string_obj);
hlipka 0:7c9aa931c67c 212
hlipka 0:7c9aa931c67c 213 /* This is the starting point of the next field */
hlipka 0:7c9aa931c67c 214 field_start = char_pos + 1;
hlipka 0:7c9aa931c67c 215
hlipka 0:7c9aa931c67c 216 } else if (curr_char == line_term_char)
hlipka 0:7c9aa931c67c 217 {
hlipka 0:7c9aa931c67c 218 field_end = char_pos;
hlipka 0:7c9aa931c67c 219
hlipka 0:7c9aa931c67c 220 const char * field_starts_at = line + field_start;
hlipka 0:7c9aa931c67c 221
hlipka 0:7c9aa931c67c 222 /* Field width must exclude line terminating characters */
hlipka 0:7c9aa931c67c 223 const unsigned int field_width = field_end - field_start;
hlipka 0:7c9aa931c67c 224
hlipka 0:7c9aa931c67c 225 /* Copy exactly field_width bytes from field_starts_at to field */
hlipka 0:7c9aa931c67c 226 memcpy(field, field_starts_at, field_width);
hlipka 0:7c9aa931c67c 227
hlipka 0:7c9aa931c67c 228 /* This must be a null-terminated character array */
hlipka 0:7c9aa931c67c 229 field[field_width] = 0x00;
hlipka 0:7c9aa931c67c 230
hlipka 0:7c9aa931c67c 231 string field_string_obj = field;
hlipka 0:7c9aa931c67c 232
hlipka 0:7c9aa931c67c 233 row->push_back(field_string_obj);
hlipka 0:7c9aa931c67c 234 }
hlipka 0:7c9aa931c67c 235
hlipka 0:7c9aa931c67c 236 /* Move to the next character in the current line */
hlipka 0:7c9aa931c67c 237 char_pos++;
hlipka 0:7c9aa931c67c 238 }
hlipka 0:7c9aa931c67c 239
hlipka 0:7c9aa931c67c 240 /* Deallocate memory for field buffer */
hlipka 0:7c9aa931c67c 241 CSV_PARSER_FREE_BUFFER_PTR(field);
hlipka 0:7c9aa931c67c 242 }
hlipka 0:7c9aa931c67c 243 }
hlipka 0:7c9aa931c67c 244
hlipka 0:7c9aa931c67c 245 void csv_parser::_get_fields_with_enclosure(csv_row_ptr row, const char * line, const unsigned int * line_length)
hlipka 0:7c9aa931c67c 246 {
hlipka 0:7c9aa931c67c 247 char * field = NULL;
hlipka 0:7c9aa931c67c 248
hlipka 0:7c9aa931c67c 249 if (*line_length > 0)
hlipka 0:7c9aa931c67c 250 {
hlipka 0:7c9aa931c67c 251 field = (char *) malloc(*line_length);
hlipka 0:7c9aa931c67c 252
hlipka 0:7c9aa931c67c 253 memset(field, 0, *line_length);
hlipka 0:7c9aa931c67c 254
hlipka 0:7c9aa931c67c 255 register unsigned int current_state = 0U;
hlipka 0:7c9aa931c67c 256 register unsigned int field_start = 0U;
hlipka 0:7c9aa931c67c 257 register unsigned int field_end = 0U;
hlipka 0:7c9aa931c67c 258 register unsigned int char_pos = 0U;
hlipka 0:7c9aa931c67c 259
hlipka 0:7c9aa931c67c 260 while(char_pos < *line_length)
hlipka 0:7c9aa931c67c 261 {
hlipka 0:7c9aa931c67c 262 char curr_char = line[char_pos];
hlipka 0:7c9aa931c67c 263
hlipka 0:7c9aa931c67c 264 if (curr_char == enclosed_char)
hlipka 0:7c9aa931c67c 265 {
hlipka 0:7c9aa931c67c 266 current_state++;
hlipka 0:7c9aa931c67c 267
hlipka 0:7c9aa931c67c 268 /* Lets find out if the enclosure character encountered is
hlipka 0:7c9aa931c67c 269 * a 'real' enclosure character or if it is an embedded character that
hlipka 0:7c9aa931c67c 270 * has been escaped within the field.
hlipka 0:7c9aa931c67c 271 */
hlipka 0:7c9aa931c67c 272 register char previous_char = 0x00;
hlipka 0:7c9aa931c67c 273
hlipka 0:7c9aa931c67c 274 if (char_pos > 0U)
hlipka 0:7c9aa931c67c 275 {
hlipka 0:7c9aa931c67c 276 /* The escaped char will have to be the 2rd or later character. */
hlipka 0:7c9aa931c67c 277 previous_char = line[char_pos - 1];
hlipka 0:7c9aa931c67c 278
hlipka 0:7c9aa931c67c 279 if (previous_char == escaped_char)
hlipka 0:7c9aa931c67c 280 {
hlipka 0:7c9aa931c67c 281 --current_state;
hlipka 0:7c9aa931c67c 282 }
hlipka 0:7c9aa931c67c 283 }
hlipka 0:7c9aa931c67c 284
hlipka 0:7c9aa931c67c 285 if (current_state == 1U && previous_char != escaped_char)
hlipka 0:7c9aa931c67c 286 {
hlipka 0:7c9aa931c67c 287 /* This marks the beginning of the column */
hlipka 0:7c9aa931c67c 288 field_start = char_pos;
hlipka 0:7c9aa931c67c 289
hlipka 0:7c9aa931c67c 290 } else if (current_state == 2U)
hlipka 0:7c9aa931c67c 291 {
hlipka 0:7c9aa931c67c 292 /* We have found the end of the current field */
hlipka 0:7c9aa931c67c 293 field_end = char_pos;
hlipka 0:7c9aa931c67c 294
hlipka 0:7c9aa931c67c 295 /* We do not need the enclosure characters */
hlipka 0:7c9aa931c67c 296 const char * field_starts_at = line + field_start + 1U;
hlipka 0:7c9aa931c67c 297
hlipka 0:7c9aa931c67c 298 /* Field width must exclude beginning and ending enclosure characters */
hlipka 0:7c9aa931c67c 299 const unsigned int field_width = field_end - field_start - 1U;
hlipka 0:7c9aa931c67c 300
hlipka 0:7c9aa931c67c 301 /* Copy exactly field_width bytes from field_starts_at to field */
hlipka 0:7c9aa931c67c 302 memcpy(field, field_starts_at, field_width);
hlipka 0:7c9aa931c67c 303
hlipka 0:7c9aa931c67c 304 /* This must be a null-terminated character array */
hlipka 0:7c9aa931c67c 305 field[field_width] = 0x00;
hlipka 0:7c9aa931c67c 306
hlipka 0:7c9aa931c67c 307 string field_string_obj = field;
hlipka 0:7c9aa931c67c 308
hlipka 0:7c9aa931c67c 309 row->push_back(field_string_obj);
hlipka 0:7c9aa931c67c 310
hlipka 0:7c9aa931c67c 311 /* Reset the state to zero value for the next field */
hlipka 0:7c9aa931c67c 312 current_state = 0U;
hlipka 0:7c9aa931c67c 313 }
hlipka 0:7c9aa931c67c 314 }
hlipka 0:7c9aa931c67c 315
hlipka 0:7c9aa931c67c 316 /* Move to the next character in the current line */
hlipka 0:7c9aa931c67c 317 char_pos++;
hlipka 0:7c9aa931c67c 318 }
hlipka 0:7c9aa931c67c 319
hlipka 0:7c9aa931c67c 320 /* If no enclosures were found in this line, the entire line becomes the only field. */
hlipka 0:7c9aa931c67c 321 if (0 == row->size())
hlipka 0:7c9aa931c67c 322 {
hlipka 0:7c9aa931c67c 323 string entire_line = line;
hlipka 0:7c9aa931c67c 324
hlipka 0:7c9aa931c67c 325 row->push_back(entire_line);
hlipka 0:7c9aa931c67c 326
hlipka 0:7c9aa931c67c 327 } else if (current_state == 1U)
hlipka 0:7c9aa931c67c 328 {
hlipka 0:7c9aa931c67c 329 /* The beginning enclosure character was found but
hlipka 0:7c9aa931c67c 330 * we could not locate the closing enclosure in the current line
hlipka 0:7c9aa931c67c 331 * So we need to copy the remainder of the line into the last field.
hlipka 0:7c9aa931c67c 332 */
hlipka 0:7c9aa931c67c 333
hlipka 0:7c9aa931c67c 334 /* We do not need the starting enclosure character */
hlipka 0:7c9aa931c67c 335 const char * field_starts_at = line + field_start + 1U;
hlipka 0:7c9aa931c67c 336
hlipka 0:7c9aa931c67c 337 /* Field width must exclude beginning characters */
hlipka 0:7c9aa931c67c 338 const unsigned int field_width = *line_length - field_start - 1U;
hlipka 0:7c9aa931c67c 339
hlipka 0:7c9aa931c67c 340 /* Copy exactly field_width bytes from field_starts_at to field */
hlipka 0:7c9aa931c67c 341 memcpy(field, field_starts_at, field_width);
hlipka 0:7c9aa931c67c 342
hlipka 0:7c9aa931c67c 343 /* This must be a null-terminated character array */
hlipka 0:7c9aa931c67c 344 field[field_width] = 0x00;
hlipka 0:7c9aa931c67c 345
hlipka 0:7c9aa931c67c 346 string field_string_obj = field;
hlipka 0:7c9aa931c67c 347
hlipka 0:7c9aa931c67c 348 row->push_back(field_string_obj);
hlipka 0:7c9aa931c67c 349 }
hlipka 0:7c9aa931c67c 350
hlipka 0:7c9aa931c67c 351 /* Release the buffer for the field */
hlipka 0:7c9aa931c67c 352 CSV_PARSER_FREE_BUFFER_PTR(field);
hlipka 0:7c9aa931c67c 353 }
hlipka 0:7c9aa931c67c 354 }
hlipka 0:7c9aa931c67c 355
hlipka 0:7c9aa931c67c 356 void csv_parser::_get_fields_with_optional_enclosure(csv_row_ptr row, const char * line, const unsigned int * line_length)
hlipka 0:7c9aa931c67c 357 {
hlipka 0:7c9aa931c67c 358 char * field = NULL;
hlipka 0:7c9aa931c67c 359
hlipka 0:7c9aa931c67c 360 /*
hlipka 0:7c9aa931c67c 361 * How to extract the fields, when the enclosure char is optional.
hlipka 0:7c9aa931c67c 362 *
hlipka 0:7c9aa931c67c 363 * This is very similar to parsing the document without enclosure but with the following conditions.
hlipka 0:7c9aa931c67c 364 *
hlipka 0:7c9aa931c67c 365 * If the beginning char is an enclosure character, adjust the starting position of the string by + 1.
hlipka 0:7c9aa931c67c 366 * If the ending char is an enclosure character, adjust the ending position by -1
hlipka 0:7c9aa931c67c 367 */
hlipka 0:7c9aa931c67c 368 if (*line_length > 0)
hlipka 0:7c9aa931c67c 369 {
hlipka 0:7c9aa931c67c 370 field = (char *) malloc(*line_length);
hlipka 0:7c9aa931c67c 371
hlipka 0:7c9aa931c67c 372 memset(field, 0, *line_length);
hlipka 0:7c9aa931c67c 373
hlipka 0:7c9aa931c67c 374 register unsigned int field_start = 0U;
hlipka 0:7c9aa931c67c 375 register unsigned int field_end = 0U;
hlipka 0:7c9aa931c67c 376 register unsigned int char_pos = 0U;
hlipka 0:7c9aa931c67c 377
hlipka 0:7c9aa931c67c 378 while(char_pos < *line_length)
hlipka 0:7c9aa931c67c 379 {
hlipka 0:7c9aa931c67c 380 char curr_char = line[char_pos];
hlipka 0:7c9aa931c67c 381
hlipka 0:7c9aa931c67c 382 if (curr_char == field_term_char)
hlipka 0:7c9aa931c67c 383 {
hlipka 0:7c9aa931c67c 384 field_end = char_pos;
hlipka 0:7c9aa931c67c 385
hlipka 0:7c9aa931c67c 386 const char * field_starts_at = line + field_start;
hlipka 0:7c9aa931c67c 387
hlipka 0:7c9aa931c67c 388 /* Field width must exclude field delimiter characters */
hlipka 0:7c9aa931c67c 389 unsigned int field_width = field_end - field_start;
hlipka 0:7c9aa931c67c 390
hlipka 0:7c9aa931c67c 391 const char line_first_char = field_starts_at[0];
hlipka 0:7c9aa931c67c 392 const char line_final_char = field_starts_at[field_width - 1];
hlipka 0:7c9aa931c67c 393
hlipka 0:7c9aa931c67c 394 /* If the enclosure char is found at either ends of the string */
hlipka 0:7c9aa931c67c 395 unsigned int first_adjustment = (line_first_char == enclosed_char) ? 1U : 0U;
hlipka 0:7c9aa931c67c 396 unsigned int final_adjustment = (line_final_char == enclosed_char) ? 2U : 0U;
hlipka 0:7c9aa931c67c 397
hlipka 0:7c9aa931c67c 398 /* We do not want to have any negative or zero field widths */
hlipka 0:7c9aa931c67c 399 field_width = (field_width > 2U) ? (field_width - final_adjustment) : field_width;
hlipka 0:7c9aa931c67c 400
hlipka 0:7c9aa931c67c 401 /* Copy exactly field_width bytes from field_starts_at to field */
hlipka 0:7c9aa931c67c 402 memcpy(field, field_starts_at + first_adjustment, field_width);
hlipka 0:7c9aa931c67c 403
hlipka 0:7c9aa931c67c 404 /* This must be a null-terminated character array */
hlipka 0:7c9aa931c67c 405 field[field_width] = 0x00;
hlipka 0:7c9aa931c67c 406
hlipka 0:7c9aa931c67c 407 string field_string_obj = field;
hlipka 0:7c9aa931c67c 408
hlipka 0:7c9aa931c67c 409 row->push_back(field_string_obj);
hlipka 0:7c9aa931c67c 410
hlipka 0:7c9aa931c67c 411 /* This is the starting point of the next field */
hlipka 0:7c9aa931c67c 412 field_start = char_pos + 1;
hlipka 0:7c9aa931c67c 413
hlipka 0:7c9aa931c67c 414 } else if (curr_char == line_term_char)
hlipka 0:7c9aa931c67c 415 {
hlipka 0:7c9aa931c67c 416 field_end = char_pos;
hlipka 0:7c9aa931c67c 417
hlipka 0:7c9aa931c67c 418 const char * field_starts_at = line + field_start;
hlipka 0:7c9aa931c67c 419
hlipka 0:7c9aa931c67c 420 /* Field width must exclude line terminating characters */
hlipka 0:7c9aa931c67c 421 unsigned int field_width = field_end - field_start;
hlipka 0:7c9aa931c67c 422
hlipka 0:7c9aa931c67c 423 const char line_first_char = field_starts_at[0];
hlipka 0:7c9aa931c67c 424 const char line_final_char = field_starts_at[field_width - 1];
hlipka 0:7c9aa931c67c 425
hlipka 0:7c9aa931c67c 426 /* If the enclosure char is found at either ends of the string */
hlipka 0:7c9aa931c67c 427 unsigned int first_adjustment = (line_first_char == enclosed_char) ? 1U : 0U;
hlipka 0:7c9aa931c67c 428 unsigned int final_adjustment = (line_final_char == enclosed_char) ? 2U : 0U;
hlipka 0:7c9aa931c67c 429
hlipka 0:7c9aa931c67c 430 /* We do not want to have any negative or zero field widths */
hlipka 0:7c9aa931c67c 431 field_width = (field_width > 2U) ? (field_width - final_adjustment) : field_width;
hlipka 0:7c9aa931c67c 432
hlipka 0:7c9aa931c67c 433 /* Copy exactly field_width bytes from field_starts_at to field */
hlipka 0:7c9aa931c67c 434 memcpy(field, field_starts_at + first_adjustment, field_width);
hlipka 0:7c9aa931c67c 435
hlipka 0:7c9aa931c67c 436 /* This must be a null-terminated character array */
hlipka 0:7c9aa931c67c 437 field[field_width] = 0x00;
hlipka 0:7c9aa931c67c 438
hlipka 0:7c9aa931c67c 439 string field_string_obj = field;
hlipka 0:7c9aa931c67c 440
hlipka 0:7c9aa931c67c 441 row->push_back(field_string_obj);
hlipka 0:7c9aa931c67c 442 }
hlipka 0:7c9aa931c67c 443
hlipka 0:7c9aa931c67c 444 /* Move to the next character in the current line */
hlipka 0:7c9aa931c67c 445 char_pos++;
hlipka 0:7c9aa931c67c 446 }
hlipka 0:7c9aa931c67c 447
hlipka 0:7c9aa931c67c 448 /* Deallocate memory for field buffer */
hlipka 0:7c9aa931c67c 449 CSV_PARSER_FREE_BUFFER_PTR(field);
hlipka 0:7c9aa931c67c 450 }
hlipka 0:7c9aa931c67c 451 }
hlipka 0:7c9aa931c67c 452
hlipka 0:7c9aa931c67c 453 void csv_parser::_read_single_line(char ** buffer, unsigned int * buffer_len)
hlipka 0:7c9aa931c67c 454 {
hlipka 0:7c9aa931c67c 455 long int original_pos = ftell(input_fp);
hlipka 0:7c9aa931c67c 456 long int current_pos = original_pos;
hlipka 0:7c9aa931c67c 457
hlipka 0:7c9aa931c67c 458 register int current_char = 0;
hlipka 0:7c9aa931c67c 459
hlipka 0:7c9aa931c67c 460 /* Checking one character at a time until the end of a line is found */
hlipka 0:7c9aa931c67c 461 while(true)
hlipka 0:7c9aa931c67c 462 {
hlipka 0:7c9aa931c67c 463 current_char = fgetc(input_fp);
hlipka 0:7c9aa931c67c 464
hlipka 0:7c9aa931c67c 465 if (current_char == EOF)
hlipka 0:7c9aa931c67c 466 {
hlipka 0:7c9aa931c67c 467 /* We have reached the end of the file */
hlipka 0:7c9aa931c67c 468 more_rows = false;
hlipka 0:7c9aa931c67c 469
hlipka 0:7c9aa931c67c 470 break;
hlipka 0:7c9aa931c67c 471
hlipka 0:7c9aa931c67c 472 } else if (current_char == line_term_char)
hlipka 0:7c9aa931c67c 473 {
hlipka 0:7c9aa931c67c 474 /* We have reached the end of the row */
hlipka 0:7c9aa931c67c 475 current_pos++;
hlipka 0:7c9aa931c67c 476
hlipka 0:7c9aa931c67c 477 break;
hlipka 0:7c9aa931c67c 478
hlipka 0:7c9aa931c67c 479 } else {
hlipka 0:7c9aa931c67c 480
hlipka 0:7c9aa931c67c 481 current_pos++;
hlipka 0:7c9aa931c67c 482 }
hlipka 0:7c9aa931c67c 483 }
hlipka 0:7c9aa931c67c 484
hlipka 0:7c9aa931c67c 485 /* Let's try to peek one character ahead to see if we are at the end of the file */
hlipka 0:7c9aa931c67c 486 if (more_rows)
hlipka 0:7c9aa931c67c 487 {
hlipka 0:7c9aa931c67c 488 current_char = fgetc(input_fp);
hlipka 0:7c9aa931c67c 489
hlipka 0:7c9aa931c67c 490 more_rows = (current_char == EOF) ? false : true;
hlipka 0:7c9aa931c67c 491 }
hlipka 0:7c9aa931c67c 492
hlipka 0:7c9aa931c67c 493 /* Find out how long this row is */
hlipka 0:7c9aa931c67c 494 const size_t length_of_row = current_pos - original_pos;
hlipka 0:7c9aa931c67c 495
hlipka 0:7c9aa931c67c 496 if (length_of_row > 0)
hlipka 0:7c9aa931c67c 497 {
hlipka 0:7c9aa931c67c 498 *buffer_len = length_of_row * sizeof(char) + 1;
hlipka 0:7c9aa931c67c 499
hlipka 0:7c9aa931c67c 500 *buffer = (char *) realloc(*buffer, *buffer_len);
hlipka 0:7c9aa931c67c 501
hlipka 0:7c9aa931c67c 502 memset(*buffer, 0, *buffer_len);
hlipka 0:7c9aa931c67c 503
hlipka 0:7c9aa931c67c 504 /* Reset the internal pointer to the original position */
hlipka 0:7c9aa931c67c 505 fseek(input_fp, original_pos, SEEK_SET);
hlipka 0:7c9aa931c67c 506
hlipka 0:7c9aa931c67c 507 /* Copy the contents of the line into the buffer */
hlipka 0:7c9aa931c67c 508 fread(*buffer, 1, length_of_row, input_fp);
hlipka 0:7c9aa931c67c 509 }
hlipka 0:7c9aa931c67c 510 }