Javascript code to parse CSV data

Does someone have an idea on where I could find some javascript code to parse CSV data ?

Answers:

Answer

jQuery-CSV

It's a jquery plugin designed to work as an end-to-end solution for parsing CSV into Javascript data. It handles every single edge case presented in RFC 4180, as well as some that pop up for Excel/Google Spreadsheed exports (ie mostly involving null values) that the spec is missing.

Example:

track,artist,album,year

Dangerous,'Busta Rhymes','When Disaster Strikes',1997

// calling this
music = $.csv.toArrays(csv)

// outputs...
[
  ["track","artist","album","year"],
  ["Dangerous","Busta Rhymes","When Disaster Strikes","1997"]
]

console.log(music[1][2]) // outputs: 'When Disaster Strikes'

Update:

Oh yeah, I should also probably mention that it's completely configurable.

music = $.csv.toArrays(csv, {
  delimiter:"'", // sets a custom value delimiter character
  separator:';', // sets a custom field separator character
});

Update 2:

It now works with jQuery on Node.js too. So you have the option of doing either client-side or server-side parsing with the same lib.

Update 3:

Since Google Code shutdown, jquery-csv has been migrated to GitHub.

Disclaimer: I am also the author of jQuery-CSV.

Answer

I have an implementation as part of a spreadsheet project.

This code is not yet tested thoroughly, but anyone is welcome to use it.

As some of the answers noted though, your implementation can be much simpler if you actually have DSV or TSV file, as they disallow the use of the record and field separators in the values. CSV, on the other hand can actually have commas and newlines inside a field, which breaks most regex and split-based approaches.

var CSV = {
parse: function(csv, reviver) {
    reviver = reviver || function(r, c, v) { return v; };
    var chars = csv.split(''), c = 0, cc = chars.length, start, end, table = [], row;
    while (c < cc) {
        table.push(row = []);
        while (c < cc && '\r' !== chars[c] && '\n' !== chars[c]) {
            start = end = c;
            if ('"' === chars[c]){
                start = end = ++c;
                while (c < cc) {
                    if ('"' === chars[c]) {
                        if ('"' !== chars[c+1]) { break; }
                        else { chars[++c] = ''; } // unescape ""
                    }
                    end = ++c;
                }
                if ('"' === chars[c]) { ++c; }
                while (c < cc && '\r' !== chars[c] && '\n' !== chars[c] && ',' !== chars[c]) { ++c; }
            } else {
                while (c < cc && '\r' !== chars[c] && '\n' !== chars[c] && ',' !== chars[c]) { end = ++c; }
            }
            row.push(reviver(table.length-1, row.length, chars.slice(start, end).join('')));
            if (',' === chars[c]) { ++c; }
        }
        if ('\r' === chars[c]) { ++c; }
        if ('\n' === chars[c]) { ++c; }
    }
    return table;
},

stringify: function(table, replacer) {
    replacer = replacer || function(r, c, v) { return v; };
    var csv = '', c, cc, r, rr = table.length, cell;
    for (r = 0; r < rr; ++r) {
        if (r) { csv += '\r\n'; }
        for (c = 0, cc = table[r].length; c < cc; ++c) {
            if (c) { csv += ','; }
            cell = replacer(r, c, table[r][c]);
            if (/[,\r\n"]/.test(cell)) { cell = '"' + cell.replace(/"/g, '""') + '"'; }
            csv += (cell || 0 === cell) ? cell : '';
        }
    }
    return csv;
}
};
Answer

Here's an extremely simple CSV parser that handles quoted fields with commas, new lines, and escaped double quotation marks. There's no splitting or RegEx. It scans the input string 1-2 characters at a time and builds an array.

Test it at http://jsfiddle.net/vHKYH/.

function parseCSV(str) {
    var arr = [];
    var quote = false;  // true means we're inside a quoted field

    // iterate over each character, keep track of current row and column (of the returned array)
    for (var row = 0, col = 0, c = 0; c < str.length; c++) {
        var cc = str[c], nc = str[c+1];        // current character, next character
        arr[row] = arr[row] || [];             // create a new row if necessary
        arr[row][col] = arr[row][col] || '';   // create a new column (start with empty string) if necessary

        // If the current character is a quotation mark, and we're inside a
        // quoted field, and the next character is also a quotation mark,
        // add a quotation mark to the current column and skip the next character
        if (cc == '"' && quote && nc == '"') { arr[row][col] += cc; ++c; continue; }  

        // If it's just one quotation mark, begin/end quoted field
        if (cc == '"') { quote = !quote; continue; }

        // If it's a comma and we're not in a quoted field, move on to the next column
        if (cc == ',' && !quote) { ++col; continue; }

        // If it's a newline (CRLF) and we're not in a quoted field, skip the next character
        // and move on to the next row and move to column 0 of that new row
        if (cc == '\r' && nc == '\n' && !quote) { ++row; col = 0; ++c; continue; }

        // If it's a newline (LF or CR) and we're not in a quoted field,
        // move on to the next row and move to column 0 of that new row
        if (cc == '\n' && !quote) { ++row; col = 0; continue; }
        if (cc == '\r' && !quote) { ++row; col = 0; continue; }

        // Otherwise, append the current character to the current column
        arr[row][col] += cc;
    }
    return arr;
}
Answer

Here's my PEG(.js) grammar that seems to do ok at RFC 4180 (i.e. it handles the examples at http://en.wikipedia.org/wiki/Comma-separated_values):

start
  = [\n\r]* first:line rest:([\n\r]+ data:line { return data; })* [\n\r]* { rest.unshift(first); return rest; }

line
  = first:field rest:("," text:field { return text; })*
    & { return !!first || rest.length; } // ignore blank lines
    { rest.unshift(first); return rest; }

field
  = '"' text:char* '"' { return text.join(''); }
  / text:[^\n\r,]* { return text.join(''); }

char
  = '"' '"' { return '"'; }
  / [^"]

Try it out at http://jsfiddle.net/knvzk/10 or http://pegjs.majda.cz/online. Download the generated parser at https://gist.github.com/3362830.

Answer

csvToArray v1.3

A compact (645 bytes) but compliant function to convert a CSV string into a 2D array, conforming to the RFC4180 standard.

https://code.google.com/archive/p/csv-to-array/downloads

Common Usage: jQuery

 $.ajax({
        url: "test.csv",
        dataType: 'text',
        cache: false
 }).done(function(csvAsString){
        csvAsArray=csvAsString.csvToArray();
 });

Common usage: Javascript

csvAsArray = csvAsString.csvToArray();

Override field separator

csvAsArray = csvAsString.csvToArray("|");

Override record separator

csvAsArray = csvAsString.csvToArray("", "#");

Override Skip Header

csvAsArray = csvAsString.csvToArray("", "", 1);

Override all

csvAsArray = csvAsString.csvToArray("|", "#", 1);
Answer

Im not sure why I couldn't kirtans ex. to work for me. It seemed to be failing on empty fields or maybe fields with trailing commas...

This one seems to handle both.

I did not write the parser code, just a wrapper around the parser function to make this work for a file. see Attribution

    var Strings = {
        /**
         * Wrapped csv line parser
         * @param s string delimited csv string
         * @param sep separator override
         * @attribution : http://www.greywyvern.com/?post=258 (comments closed on blog :( )
         */
        parseCSV : function(s,sep) {
            // http://stackoverflow.com/questions/1155678/javascript-string-newline-character
            var universalNewline = /\r\n|\r|\n/g;
            var a = s.split(universalNewline);
            for(var i in a){
                for (var f = a[i].split(sep = sep || ","), x = f.length - 1, tl; x >= 0; x--) {
                    if (f[x].replace(/"\s+$/, '"').charAt(f[x].length - 1) == '"') {
                        if ((tl = f[x].replace(/^\s+"/, '"')).length > 1 && tl.charAt(0) == '"') {
                            f[x] = f[x].replace(/^\s*"|"\s*$/g, '').replace(/""/g, '"');
                          } else if (x) {
                        f.splice(x - 1, 2, [f[x - 1], f[x]].join(sep));
                      } else f = f.shift().split(sep).concat(f);
                    } else f[x].replace(/""/g, '"');
                  } a[i] = f;
        }
        return a;
        }
    }
Answer

Why not just use .split(',') ?

http://www.w3schools.com/jsref/jsref_split.asp

var str="How are you doing today?";
var n=str.split(" "); 
Answer

You can use the CSVToArray() function mentioned in this blog entry.

<script type="text/javascript">
    // ref: http://stackoverflow.com/a/1293163/2343
    // This will parse a delimited string into an array of
    // arrays. The default delimiter is the comma, but this
    // can be overriden in the second argument.
    function CSVToArray( strData, strDelimiter ){
        // Check to see if the delimiter is defined. If not,
        // then default to comma.
        strDelimiter = (strDelimiter || ",");

        // Create a regular expression to parse the CSV values.
        var objPattern = new RegExp(
            (
                // Delimiters.
                "(\\" + strDelimiter + "|\\r?\\n|\\r|^)" +

                // Quoted fields.
                "(?:\"([^\"]*(?:\"\"[^\"]*)*)\"|" +

                // Standard fields.
                "([^\"\\" + strDelimiter + "\\r\\n]*))"
            ),
            "gi"
            );


        // Create an array to hold our data. Give the array
        // a default empty first row.
        var arrData = [[]];

        // Create an array to hold our individual pattern
        // matching groups.
        var arrMatches = null;


        // Keep looping over the regular expression matches
        // until we can no longer find a match.
        while (arrMatches = objPattern.exec( strData )){

            // Get the delimiter that was found.
            var strMatchedDelimiter = arrMatches[ 1 ];

            // Check to see if the given delimiter has a length
            // (is not the start of string) and if it matches
            // field delimiter. If id does not, then we know
            // that this delimiter is a row delimiter.
            if (
                strMatchedDelimiter.length &&
                strMatchedDelimiter !== strDelimiter
                ){

                // Since we have reached a new row of data,
                // add an empty row to our data array.
                arrData.push( [] );

            }

            var strMatchedValue;

            // Now that we have our delimiter out of the way,
            // let's check to see which kind of value we
            // captured (quoted or unquoted).
            if (arrMatches[ 2 ]){

                // We found a quoted value. When we capture
                // this value, unescape any double quotes.
                strMatchedValue = arrMatches[ 2 ].replace(
                    new RegExp( "\"\"", "g" ),
                    "\""
                    );

            } else {

                // We found a non-quoted value.
                strMatchedValue = arrMatches[ 3 ];

            }


            // Now that we have our value string, let's add
            // it to the data array.
            arrData[ arrData.length - 1 ].push( strMatchedValue );
        }

        // Return the parsed data.
        return( arrData );
    }

</script>
Answer

Regular expressions to the rescue! These few lines of code handle properly quoted fields with embedded commas, quotes, and newlines based on the RFC 4180 standard.

function parseCsv(data, fieldSep, newLine) {
    fieldSep = fieldSep || ',';
    newLine = newLine || '\n';
    var nSep = '\x1D';
    var qSep = '\x1E';
    var cSep = '\x1F';
    var nSepRe = new RegExp(nSep, 'g');
    var qSepRe = new RegExp(qSep, 'g');
    var cSepRe = new RegExp(cSep, 'g');
    var fieldRe = new RegExp('(?<=(^|[' + fieldSep + '\\n]))"(|[\\s\\S]+?(?<![^"]"))"(?=($|[' + fieldSep + '\\n]))', 'g');
    var grid = [];
    data.replace(/\r/g, '').replace(/\n+$/, '').replace(fieldRe, function(match, p1, p2) {
        return p2.replace(/\n/g, nSep).replace(/""/g, qSep).replace(/,/g, cSep);
    }).split(/\n/).forEach(function(line) {
        var row = line.split(fieldSep).map(function(cell) {
            return cell.replace(nSepRe, newLine).replace(qSepRe, '"').replace(cSepRe, ',');
        });
        grid.push(row);
    });
    return grid;
}

const csv = 'A1,B1,C1\n"A ""2""","B, 2","C\n2"';
const separator = ',';      // field separator, default: ','
const newline = ' <br /> '; // newline representation in case a field contains newlines, default: '\n' 
var grid = parseCsv(csv, separator, newline);
// expected: [ [ 'A1', 'B1', 'C1' ], [ 'A "2"', 'B, 2', 'C <br /> 2' ] ]

Unless stated elsewhere, you don't need a finite state machine. The regular expression handles RFC 4180 properly thanks to positive lookbehind, negative lookbehind, and positive lookahead.

Clone/download code at https://github.com/peterthoeny/parse-csv-js

Answer

I have constructed this javascript script to parse a CSV in string to array object. I find it better to break down the whole CSV into lines, fields and process them accordingly. I think that it will make it easy for you to change the code to suit your need.

I hope it will help you. Thanks.

    //
    //
    // CSV to object
    //
    //

    const new_line_char = '\n';
    const field_separator_char = ',';

    function parse_csv(csv_str) {

        var result = [];

        let line_end_index_moved = false;
        let line_start_index = 0;
        let line_end_index = 0;
        let csr_index = 0;
        let cursor_val = csv_str[csr_index];
        let found_new_line_char = get_new_line_char(csv_str);
        let in_quote = false;

        // handle \r\n
        if (found_new_line_char == '\r\n') {
            csv_str = csv_str.split(found_new_line_char).join(new_line_char);
        }
        // handle last char is not \n
        if (csv_str[csv_str.length - 1] !== new_line_char) {
            csv_str += new_line_char;
        }

        while (csr_index < csv_str.length) {
            if (cursor_val === '"') {
                in_quote = !in_quote;
            } else if (cursor_val === new_line_char) {
                if (in_quote === false) {
                    if (line_end_index_moved && (line_start_index <= line_end_index)) {
                        result.push(parse_csv_line(csv_str.substring(line_start_index, line_end_index)));
                        line_start_index = csr_index + 1;
                    } // else: just ignore line_end_index has not moved or line has not been sliced for parsing the line
                } // else: just ignore because we are in quote
            }
            csr_index++;
            cursor_val = csv_str[csr_index];
            line_end_index = csr_index;
            line_end_index_moved = true;
        }

        // handle \r\n
        if (found_new_line_char == '\r\n') {
            let new_result = [];
            let curr_row;
            for (var i = 0; i < result.length; i++) {
                curr_row = [];
                for (var j = 0; j < result[i].length; j++) {
                    curr_row.push(result[i][j].split(new_line_char).join('\r\n'));
                }
                new_result.push(curr_row);
            }
            result = new_result;
        }

        return result;
    }

    function parse_csv_line(csv_line_str) {

        var result = [];

        // let field_end_index_moved = false;
        let field_start_index = 0;
        let field_end_index = 0;
        let csr_index = 0;
        let cursor_val = csv_line_str[csr_index];
        let in_quote = false;

        // Pretend that the last char is the separator_char to complete the loop
        csv_line_str += field_separator_char;

        while (csr_index < csv_line_str.length) {
            if (cursor_val === '"') {
                in_quote = !in_quote;
            } else if (cursor_val === field_separator_char) {
                if (in_quote === false) {
                    if (field_start_index <= field_end_index) {
                        result.push(parse_csv_field(csv_line_str.substring(field_start_index, field_end_index)));
                        field_start_index = csr_index + 1;
                    } // else: just ignore field_end_index has not moved or field has not been sliced for parsing the field
                } // else: just ignore because we are in quote
            }
            csr_index++;
            cursor_val = csv_line_str[csr_index];
            field_end_index = csr_index;
            field_end_index_moved = true;
        }

        return result;
    }

    function parse_csv_field(csv_field_str) {
        with_quote = (csv_field_str[0] === '"');

        if (with_quote) {
            csv_field_str = csv_field_str.substring(1, csv_field_str.length - 1); // remove the start and end quotes
            csv_field_str = csv_field_str.split('""').join('"'); // handle double quotes
        }

        return csv_field_str;
    }

    // initial method: check the first newline character only
    function get_new_line_char(csv_str) {
        if (csv_str.indexOf('\r\n') > -1) {
            return '\r\n';
        } else {
            return '\n'
        }
    }
Answer

here's my simple vanilla js:

let a = 'one,two,"three, but with a comma",four,"five, with ""quotes"" in it.."'
console.log(splitQuotes(a))

function splitQuotes(line) {
  if(line.indexOf('"') < 0) return line.split(',')
  let result = [], cell = '', quote = false;
  for(let i = 0; i < line.length; i++) {
    char = line[i]
    if(char == '"' && line[i+1] == '"') {
      cell += char
      i++
    } else if(char == '"') {
      quote = !quote;
    } else if(!quote && char == ',') {
      result.push(cell)
      cell = ''
    } else {
      cell += char
    }
    if ( i == line.length-1 && cell) {
      result.push(cell)
    }
  }
  return result
}
Answer

Here's another solution, this uses:

  • a coarse global regex for splitting the CSV string (which includes surrounding quotes and trailing commas)
  • fine grain regex for cleaning up the surrounding quotes and trailing commas
  • also, have type correction differentiating strings, numbers and boolean values

For the following input string:

"This is\, a value",Hello,4,-123,3.1415,'This is also\, possible',true

The code outputs:

[
  "This is, a value",
  "Hello",
  4,
  -123,
  3.1415,
  "This is also, possible",
  true
]

Here's my implementation of parseCSVLine() in a runnable code snippet:

function parseCSVLine(text) {
  return text.match( /\s*(\".*?\"|'.*?'|[^,]+)\s*(,|$)/g ).map( function (text) {
    let m;
    if (m = text.match(/^\s*\"(.*?)\"\s*,?$/)) return m[1]; // Double Quoted Text
    if (m = text.match(/^\s*'(.*?)'\s*,?$/)) return m[1]; // Single Quoted Text
    if (m = text.match(/^\s*(true|false)\s*,?$/)) return m[1] === "true"; // Boolean
    if (m = text.match(/^\s*((?:\+|\-)?\d+)\s*,?$/)) return parseInt(m[1]); // Integer Number
    if (m = text.match(/^\s*((?:\+|\-)?\d*\.\d*)\s*,?$/)) return parseFloat(m[1]); // Floating Number
    if (m = text.match(/^\s*(.*?)\s*,?$/)) return m[1]; // Unquoted Text
    return text;
  } );
}

let data = `"This is\, a value",Hello,4,-123,3.1415,'This is also\, possible',true`;
let obj = parseCSVLine(data);
console.log( JSON.stringify( obj, undefined, 2 ) );

Tags

Recent Questions

Top Questions

Home Tags Terms of Service Privacy Policy DMCA Contact Us

©2020 All rights reserved.