In this article I will show a bit of code I wrote to parse a CSV file to a list of defined objects.
There are a lot of really good CSV readers out there and I don’t intend on adding one more to that mix, but I often do one-off custom data processing jobs where I don’t want to bring a third party package into my project. I want total transparency for auditing purposes.
In Part 2 I will look at evaluating my solution below, with a common library from NuGet.
Background
I had a CSV file with 266 fields, but my application only needed two of them. So I used that as an opportunity to play around and see what I could come up with. Yeah it could be seen as reinventing the wheel, but sometimes I just like to play (in my own time or course).
Overview
This code has 2 parts, the first is custom attributes and an extension method, and the second is the Parser. When I used this code, I kept it all together in one file.
The Code
Custom Attributes
These custom attributes are used for mapping the class property to the CSV field, i.e. when your CSV field doesn’t have a name or the name doesn’t match the class property.
Column Name and Number Attributes
public class CSVColumnName : System.Attribute
{
public string Value { get; protected set; }
public CSVColumnName(string value)
{
this.Value = value;
}
}
public class CSVColumnNumber : System.Attribute
{
public int Value { get; protected set; }
public CSVColumnNumber(int value)
{
this.Value = value;
}
}
Extension method for reading the custom attributes
public static class CSVExtensionMethods
{
public static string GetCSVColumnName(this PropertyInfo property)
{
CSVColumnName[] attributtes = property.GetCustomAttributes(typeof(CSVColumnName), false) as CSVColumnName[];
return attributtes.Length > 0 ? attributtes[0].Value : string.Empty;
}
public static int GetCSVColumnNumber(this PropertyInfo property)
{
CSVColumnNumber[] attributtes = property.GetCustomAttributes(typeof(CSVColumnNumber), false) as CSVColumnNumber[];
return attributtes.Length > 0 ? attributtes[0].Value : -1;
}
}
CSV Parser
public class CSVParser<t>
{
public bool ColumnNameCaseInsentive { get; set; } = false;
protected Dictionary<string, int> columnMapping = new Dictionary<string, int>();
public List<t> ParseFile(string filePath, bool firstRowHeader)
{
if (ColumnNameCaseInsentive)
columnMapping = new Dictionary<string, int>(StringComparer.InvariantCultureIgnoreCase);
else
columnMapping = new Dictionary<string, int>();
string[] rows = File.ReadAllLines(filePath);
if (!firstRowHeader) // then can't use column names
MapColumnsByIndex();
else
MapColumns(rows[0]);
if (columnMapping.Count == 0)
throw new Exception("No columns could not be matched with the class.");
return rows.Skip(firstRowHeader ? 1 : 0).Select(i => ParseRow(i)).ToList();
}
protected t ParseRow(string row)
{
// this pattern only looks for comma's outside of quotes
var pattern = ",(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)";
string[] values = Regex.Split(row, pattern);
t newObject = (t)Activator.CreateInstance(typeof(t));
IList propertyInfos = newObject.GetType().GetProperties();
foreach (PropertyInfo property in propertyInfos)
{
if (columnMapping.ContainsKey(property.Name))
ParseValue(property, newObject, values[columnMapping[property.Name]]);
}
return newObject;
}
protected void ParseValue(PropertyInfo property, t newObject, string value)
{
if (property.PropertyType == typeof(string))
{
ParseStringValue(property, newObject, value);
return;
}
if (property.PropertyType == typeof(int))
{
ParseIntValue(property, newObject, value);
return;
}
if (property.PropertyType == typeof(bool))
{
ParseBoolValue(property, newObject, value);
return;
}
if (property.PropertyType == typeof(DateTime))
{
ParseDateTimeValue(property, newObject, value);
return;
}
// just give it a crack
property.SetValue(newObject, value);
}
protected void ParseBoolValue(PropertyInfo property, t newObject, string value)
{
if (bool.TryParse(value, out bool result))
{
property.SetValue(newObject, result);
}
else
{
// lets try something else
switch (value.ToLower())
{
case "yes":
case "1":
property.SetValue(newObject, true);
break;
case "no":
case "0":
property.SetValue(newObject, false);
break;
}
// no errors
}
}
protected void ParseStringValue(PropertyInfo property, t newObject, string value)
{
property.SetValue(newObject, value.ToString());
}
protected void ParseIntValue(PropertyInfo property, t newObject, string value)
{
if (int.TryParse(value, out int intValue))
{
property.SetValue(newObject, intValue);
}
else
{
// no errors
}
}
protected void ParseDateTimeValue(PropertyInfo property, t newObject, string value)
{
if (DateTime.TryParse(value, out DateTime date))
{
property.SetValue(newObject, date);
}
else
{
// no errors
}
}
protected void MapColumnsByIndex()
{
t newObject = (t)Activator.CreateInstance(typeof(t));
IList propertyInfos = newObject.GetType().GetProperties();
foreach (PropertyInfo property in propertyInfos)
{
int colIndex = property.GetCSVColumnNumber();
if (colIndex >= 0)
columnMapping.Add(property.Name, colIndex);
}
}
protected void MapColumns(string row)
{
t newObject = (t)Activator.CreateInstance(typeof(t));
IList propertyInfos = newObject.GetType().GetProperties();
string[] columns = row.Split(',');
foreach (PropertyInfo property in propertyInfos)
MapColumn(property, columns);
}
protected void MapColumn(PropertyInfo propInfo, string[] columns)
{
// first use the CSVColumnName attribute
string colName = propInfo.GetCSVColumnName();
if (string.IsNullOrEmpty(colName))
{ // have a crack at the column number
int colIndex = propInfo.GetCSVColumnNumber();
if (string.IsNullOrEmpty(colName) && colIndex > -1)
{
columnMapping.Add(propInfo.Name, colIndex);
return;
}
}
if (string.IsNullOrEmpty(colName))
colName = propInfo.Name; // Else use the property name
int index = -1;
if (ColumnNameCaseInsentive)
index = Array.FindIndex(columns, t => t.Equals(colName, StringComparison.InvariantCultureIgnoreCase));
else
index = Array.IndexOf(columns, colName);
if (index > -1)
columnMapping.Add(propInfo.Name, index);
}
}
Example
The CSV File
Create a file like below, which for testing purposes has different column names and different value types.
column1,column2,ID,column4,column5,column6
1.34,,12,2022-11-01T11:25:00,"Testing,Training",How to use MS Word
7.1,,13,2022-11-01T12:27:00,"Accounting",March Financials
The Object Class
This is the object we will convert the file to.
Note I have used the column name and number attributes to map the properties to the file.
public class FileObject
{
public int ID { get; set; }
[CSVColumnName("column4")]
public DateTime DateAdded { get; set; }
[CSVColumnNumber(4)]
public string Name { get; set; }
}
Parsing the CSV to the class
This is simply it, it returns a List<FileObject>
object.
string filepath = @"C:\temp\FilingObjects.csv";
var results = new CSVParser<FileObject>().ParseFile(filepath, true);
Issue with the Code
The only issue I have with the above code is that, when a text field is wrapped in a text qualifier (double quotes, like in column5 of the example), it leaves the double quotes in the value. This didn’t worry me at the time as the codes purpose was somewhat specific and it didn’t affect the output.
But if you wanted to fix that part you would need to remove the quotes with replacing the ParseStringValue
method with something like below. Although in doing so could severally affect performance.
protected void ParseStringValue(PropertyInfo property, t newObject, string value)
{
property.SetValue(newObject, Regex.Replace(value.ToString(), "^\"(.*)\"$", "$1"));
}