/**
* *****************************************************************************
* Copyright 2013 SEMOSS.ORG
*
* This file is part of SEMOSS.
*
* SEMOSS is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* SEMOSS is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with
* SEMOSS. If not, see <http://www.gnu.org/licenses/>.
* ****************************************************************************
*/
package com.ostrichemulators.semtool.poi.main;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.log4j.Logger;
import org.openrdf.model.URI;
import org.openrdf.model.Value;
import org.openrdf.model.ValueFactory;
import org.openrdf.model.vocabulary.XMLSchema;
import org.supercsv.io.CsvMapReader;
import org.supercsv.prefs.CsvPreference;
import java.io.FileNotFoundException;
import java.util.Properties;
import org.openrdf.model.impl.URIImpl;
import org.openrdf.model.impl.ValueFactoryImpl;
/**
* Loads data into SEMOSS from comma-separated value (CSV) files, using a
* control ("prop") file to describe relationships, node properties, and
* column datatypes.
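*
* <p>
* A minimal usage sketch (the file names here are hypothetical):
* <pre>{@code
* CSVReader reader = new CSVReader( new File( "control.prop" ) );
* ImportData data = reader.readOneFile( new File( "data.csv" ) );
* }</pre>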
*/
public class CSVReader implements ImportFileReader {
private static final Logger logger = Logger.getLogger( CSVReader.class );
private final static String START_ROW = "START_ROW";
private final static String END_ROW = "END_ROW";
private final static String RELATION = "RELATION";
private final static String NODE_PROP = "NODE_PROP";
private final static String RELATION_PROP = "RELATION_PROP";
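// relationship definitions take the form "Subject@Predicate@Object"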
private static final Pattern RELATION_PAT = Pattern.compile( "^(.*)[@](.*)[@](.*)$" );
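// node property definitions take the form "Subject%prop1%prop2..."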
private static final Pattern NODEPROP_PAT = Pattern.compile( "^([^%]+)[%](.*)$" );
// per-instance, since each control file declares its own column types
private final Map<String, URI> datatypes = new HashMap<>();
private CsvMapReader mapReader;
private String[] header;
private final List<String> relationArrayList = new ArrayList<>();
private final Map<String, List<String>> nodePropArrayList = new HashMap<>();
private final List<String> relPropArrayList = new ArrayList<>();
private File propFile;
private final Properties rdfMap = new Properties();
private boolean lsInMem = false;
public CSVReader() {
}
public CSVReader( File control ) {
propFile = control;
}
@Override
public void keepLoadInMemory( boolean b ) {
lsInMem = b;
}
/**
* Loads the prop file for the CSV file
*
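* <p>
* A hypothetical control file might contain entries such as (concept and
* column names are made up):
* <pre>
* RELATION   System@Provide@Data;
* NODE_PROP  System%Description;
* START_ROW  2
* END_ROW    500
* 1          String
* </pre>
*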
* @param fileName the control (prop) file; its absolute path is given in the
* last column of the CSV header row
*
* @throws IOException
* @throws FileNotFoundException
*/
protected void setRdfMapFromFile( File fileName ) throws FileNotFoundException, IOException {
Properties rdfPropMap = new Properties();
// use try-with-resources so the reader is closed even if loading fails
try ( FileReader rdr = new FileReader( fileName ) ) {
rdfPropMap.load( rdr );
}
for ( String name : rdfPropMap.stringPropertyNames() ) {
rdfMap.put( name, rdfPropMap.getProperty( name ) );
}
}
@Override
public ImportMetadata getMetadata( File file ) throws IOException, ImportValidationException {
return new ImportMetadata(); // no metadata for CSVs
}
@Override
public ImportData readOneFile( File file ) throws IOException, ImportValidationException {
ImportData data = new ImportData();
ImportMetadata im = data.getMetadata();
im.setSourceOfData( new URIImpl( file.toURI().toString() ) );
im.setLegacyMode( true );
try ( Reader rdr = new BufferedReader( new FileReader( file ) ) ) {
mapReader = new CsvMapReader( rdr, CsvPreference.STANDARD_PREFERENCE );
File propfile = propCSVFile( mapReader );
setRdfMapFromFile( propfile ); // will throw an IOException if missing file
createProcessors();
processConceptRelationURIs( data );
processNodePropURIs( data );
processRelationships( data );
}
finally {
if ( null != mapReader ) {
mapReader.close();
}
}
return data;
}
/**
* Maps the column types the user entered in the prop file (e.g. Double,
* Integer) to the XML Schema datatypes used when creating literals.
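*
* <p>
* For example, a hypothetical prop file entry {@code 1 Double} marks the
* first CSV column as holding doubles.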
*/
public void createProcessors() {
// lookup from the type names allowed in the prop file to XML Schema types
Map<String, URI> dtlkp = new HashMap<>();
dtlkp.put( "Double", XMLSchema.DOUBLE );
dtlkp.put( "Int", XMLSchema.INT );
dtlkp.put( "Integer", XMLSchema.INTEGER );
dtlkp.put( "Float", XMLSchema.FLOAT );
dtlkp.put( "String", XMLSchema.STRING );
for ( int col = 0; col < header.length; col++ ) {
// find the type for each column
String type = rdfMap.getProperty( Integer.toString( col + 1 ), null );
// we have some sort of datatype to worry about, so keep track of it
if ( dtlkp.containsKey( type ) ) {
datatypes.put( header[col], dtlkp.get( type ) );
}
}
}
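/**
* Reads the data rows (honoring START_ROW and END_ROW) and populates the
* node and relationship loading sheets built from the control file.
*
* @param data the import data whose sheets should be populated
*
* @throws IOException if reading the CSV fails
*/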
public void processRelationships( ImportData data ) throws IOException {
Map<String, LoadingSheetData> rels = new HashMap<>();
Map<String, LoadingSheetData> nodes = new HashMap<>();
for ( LoadingSheetData r : data.getSheets() ) {
if ( r.isRel() ) {
rels.put( r.getName(), r );
}
else {
nodes.put( r.getName(), r );
}
}
// start count at 1, since row 1 is the header
int count = 1;
int startRow = Integer.parseInt( rdfMap.getProperty( START_ROW, "2" ) );
while ( count < startRow && mapReader.read( header ) != null ) {
count++;
logger.debug( "Skipping line: " + count );
}
// END_ROW caps how many rows to load; default to 10000 unless the control
// file overrides it. The rows before START_ROW were already skipped above.
int maxRows = Integer.parseInt( rdfMap.getProperty( END_ROW, "10000" ) );
Map<String, String> jcrMap;
while ( ++count < maxRows
&& null != ( jcrMap = mapReader.read( header ) ) ) {
for ( Map.Entry<String, List<String>> en : nodePropArrayList.entrySet() ) {
String nodetype = en.getKey();
Collection<String> valuesForProp = en.getValue();
String sbjinstance = createInstanceValue( nodetype, jcrMap );
LoadingSheetData nlsd = nodes.get( nodetype );
Map<String, Value> props = new HashMap<>();
for ( String propValColumn : valuesForProp ) {
Value v = createObject( propValColumn, jcrMap );
if ( null != v ) {
props.put( propValColumn, v );
}
}
nlsd.add( sbjinstance, props );
}
// process all relationships in row
for ( String relation : relationArrayList ) {
Matcher m = RELATION_PAT.matcher( relation );
if ( !m.matches() ) {
logger.error( "can't find previously-found match (?)" );
break; // don't expect to ever get here
}
LoadingSheetData rlsd = rels.get( relation );
// get the subject and object for triple (the two indexes)
String sub = m.group( 1 );
// String predicate = m.group( 2 );
String obj = m.group( 3 );
String sbjinstance = createInstanceValue( sub, jcrMap );
String objinstance = createInstanceValue( obj, jcrMap );
rlsd.add( sbjinstance, objinstance, new HashMap<>() );
// FIXME: need to worry about relationship properties
}
}
}
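/**
* Creates a relationship loading sheet for each RELATION entry in the
* control file. Entries are semicolon-separated and take the form
* {@code Subject@Predicate@Object}.
*/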
private void processConceptRelationURIs( ImportData data ) {
// get the list of relationships from the prop file
if ( null != rdfMap.getProperty( RELATION ) ) {
String relationNames = rdfMap.getProperty( RELATION );
relationArrayList.clear();
for ( String relation : relationNames.split( ";" ) ) {
Matcher m = RELATION_PAT.matcher( relation );
if ( !m.matches() ) {
logger.warn( "skipping unparseable relationship definition: " + relation );
continue; // skip just this definition, not the rest of the list
}
relationArrayList.add( relation );
logger.debug( "Loading relation " + relation );
// get the subject and object for triple (the two indexes)
String sub = m.group( 1 );
String predicate = m.group( 2 );
String obj = m.group( 3 );
String subjectLabel = processAutoConcat( sub );
String objectLabel = processAutoConcat( obj );
// String name, String sType, String oType,String relname
LoadingSheetData rlsd = LoadingSheetData.relsheet( relation, subjectLabel,
objectLabel, predicate, lsInMem );
data.add( rlsd );
}
}
}
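/**
* Creates a node loading sheet for each NODE_PROP entry in the control
* file. Entries are semicolon-separated and take the form
* {@code Subject%prop1%prop2...}.
*/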
public void processNodePropURIs( ImportData data ) {
Map<String, LoadingSheetData> nodes = new HashMap<>();
if ( null != rdfMap.getProperty( NODE_PROP ) ) {
nodePropArrayList.clear();
String nodePropNames = rdfMap.getProperty( NODE_PROP );
for ( String nt : nodePropNames.split( ";" ) ) {
Matcher m = NODEPROP_PAT.matcher( nt );
if ( !m.matches() ) {
logger.warn( "skipping unparseable node property definition: " + nt );
continue;
}
logger.debug( "Loading Node Prop " + nt );
// get the subject and its property list (the two groups)
String sub = m.group( 1 );
String prop = m.group( 2 );
if ( !nodePropArrayList.containsKey( sub ) ) {
nodePropArrayList.put( sub, new ArrayList<>() );
}
List<String> propnames = Arrays.asList( prop.split( "%" ) );
nodePropArrayList.get( sub ).addAll( propnames );
String subjectLabel = processAutoConcat( sub );
if ( !nodes.containsKey( sub ) ) {
LoadingSheetData nlsd = LoadingSheetData.nodesheet( sub, subjectLabel );
nodes.put( sub, nlsd );
data.add( nlsd );
}
LoadingSheetData nlsd = nodes.get( sub );
for ( String pname : propnames ) {
nlsd.addProperty( pname, datatypes.get( pname ) );
}
}
}
}
/**
* Change the name of nodes that are concatenations of multiple CSV columns
* Example: changes the string "Cat+Dog" into "CatDog"
*
* @param input String name of the node that is a concatenation
*
* @return the node name with the "+" concatenation markers removed
*/
public String processAutoConcat( String input ) {
return input.replaceAll( "\\+", "" );
}
/**
* Determine if the node is a concatenation of multiple columns in the CSV
* file
*
* @param input String containing the name of the node
*
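* <p>
* For example, the hypothetical header "Cat+Dog" is proper only when both
* "Cat" and "Dog" appear as CSV column headers.
*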
* @return true when every "+"-separated element is an actual column header
*/
public boolean isProperConcatHeader( String input ) {
boolean ret = true;
List<String> headerList = Arrays.asList( header );
for ( String split1 : input.split( "\\+" ) ) {
if ( !headerList.contains( split1 ) ) {
ret = false;
break;
}
}
return ret;
}
/**
* Constructs the node instance name
*
* @param subject String containing the node type name
* @param jcrMap Map containing the data in the CSV file
*
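* <p>
* For example, given the hypothetical concatenation "Cat+Dog" and a row
* where Cat=Fluffy and Dog=Rex, the result is "Fluffy-Rex".
*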
* @return the instance-level name
*/
public String createInstanceValue( String subject, Map<String, String> jcrMap ) {
String retString = "";
// if node is a concatenation
if ( subject.contains( "+" ) ) {
String[] elements = subject.split( "\\+" );
for ( String subjectElement : elements ) {
if ( jcrMap.containsKey( subjectElement ) && jcrMap.get( subjectElement ) != null ) {
String value = jcrMap.get( subjectElement );
retString = retString + value + "-";
}
else {
retString = retString + "null-";
}
}
// trim the trailing "-" separator
if ( !retString.equals( "" ) ) {
retString = retString.substring( 0, retString.length() - 1 );
}
}
else {
if ( jcrMap.containsKey( subject ) && jcrMap.get( subject ) != null ) {
retString = jcrMap.get( subject );
}
}
return retString;
}
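/**
* Creates a literal for the given column of the row, applying any datatype
* declared in the control file. Concatenated ("+") columns always produce
* a plain string literal.
*/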
private Value createObject( String object, Map<String, String> jcrMap ) {
ValueFactory vf = new ValueFactoryImpl();
// need to do the class vs. object magic
if ( object.contains( "+" ) ) {
StringBuilder strBuilder = new StringBuilder();
String[] objList = object.split( "\\+" );
for ( String objList1 : objList ) {
strBuilder.append( jcrMap.get( objList1 ) );
}
return vf.createLiteral( strBuilder.toString() );
}
String val = jcrMap.get( object );
if ( null == val ) {
return null;
}
// see if we have a special datatype to worry about
if ( datatypes.containsKey( object ) ) {
return vf.createLiteral( val, datatypes.get( object ) );
}
return vf.createLiteral( val );
}
/**
* Gets the headers for each column and returns the control (prop) file. If
* this reader already has a prop file set (from
* {@link #CSVReader(java.io.File)}), the value from the header row is
* ignored and the pre-set file is returned instead.
*
* @param rdr the CSV reader, positioned at the header row
*
* @return the control file for the CSV (last column of the header row)
*
* @throws java.io.IOException
*/
private File propCSVFile( CsvMapReader rdr ) throws IOException {
// store the headers of each of the columns
String[] rawheaders = rdr.getHeader( true );
header = new String[rawheaders.length];
// don't copy the last index of the array, because there is no corresponding
// data column (the last column is the control file location)
System.arraycopy( rawheaders, 0, header, 0, header.length - 1 );
// last header in CSV file is the absolute path to the prop file
return ( null == propFile ? new File( rawheaders[rawheaders.length - 1] )
: propFile );
}
}