RDFDatatypeTools.java
package com.ostrichemulators.semtool.util;
import static com.ostrichemulators.semtool.rdf.query.util.QueryExecutorAdapter.getDate;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.ListIterator;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.log4j.Logger;
import org.apache.xerces.util.XMLChar;
import org.openrdf.model.BNode;
import org.openrdf.model.Literal;
import org.openrdf.model.URI;
import org.openrdf.model.Value;
import org.openrdf.model.ValueFactory;
import org.openrdf.model.datatypes.XMLDatatypeUtil;
import org.openrdf.model.impl.ValueFactoryImpl;
import org.openrdf.model.vocabulary.XMLSchema;
/**
* This class offers utility methods for converting between Objects and Values,
* as well as offering the ability to derive data types for RDF entities.
*
* @author Wayne Warren
*
*/
public class RDFDatatypeTools {
/**
* The logger for this class
*/
private static final Logger logger = Logger.getLogger( RDFDatatypeTools.class );
private static final ValueFactory vf = new ValueFactoryImpl();
public static final Pattern NAMEPATTERN
= Pattern.compile( "(?:(?:\"([^\"]+)\")|([^@]+))@([a-z-A-Z]{1,8})" );
public static final Pattern DTPATTERN
= Pattern.compile( "\"([^\\\\^]+)\"\\^\\^(.*)" );
public static final Pattern URISTARTPATTERN
= Pattern.compile( "(^[A-Za-z_-]+://).*" );
/**
* A lookup which stores the various static tags for the data types that one
* might find in an XML Schema as keys, and the corresponding native Java
* classes as values
*/
private static final Map<URI, Class<?>> TYPELOOKUP = new HashMap<>();
private static final Map<Class<?>, URI> REVTYPELOOKUP = new HashMap<>();
static {
TYPELOOKUP.put( XMLSchema.INT, Integer.class );
TYPELOOKUP.put( XMLSchema.INTEGER, Integer.class );
TYPELOOKUP.put( XMLSchema.DOUBLE, Double.class );
TYPELOOKUP.put( XMLSchema.FLOAT, Float.class );
TYPELOOKUP.put( XMLSchema.DECIMAL, Double.class );
TYPELOOKUP.put( XMLSchema.STRING, String.class );
TYPELOOKUP.put( XMLSchema.DATE, Date.class );
TYPELOOKUP.put( XMLSchema.DATETIME, Date.class );
TYPELOOKUP.put( XMLSchema.BOOLEAN, Boolean.class );
REVTYPELOOKUP.put( Integer.class, XMLSchema.INT );
REVTYPELOOKUP.put( Double.class, XMLSchema.DOUBLE );
REVTYPELOOKUP.put( Float.class, XMLSchema.FLOAT );
REVTYPELOOKUP.put( String.class, XMLSchema.STRING );
REVTYPELOOKUP.put( Date.class, XMLSchema.DATETIME );
REVTYPELOOKUP.put( Boolean.class, XMLSchema.BOOLEAN );
}
private RDFDatatypeTools() {
}
/**
* Derives the classes of a set of columns based on the row data that they
* describe
*
* @param newdata The row data described by the columns
* @param columns The number of columns to be "classed"
* @return A list (ordinal) of the classes describing the data types of the
* columns
*/
public static List<Class<?>> figureColumnClassesFromData( List<Value[]> newdata,
int columns ) {
List<Class<?>> columnClasses = new ArrayList<>();
if ( newdata.isEmpty() ) {
for ( int i = 0; i < columns; i++ ) {
columnClasses.add( Object.class );
}
}
else {
// we'd like to be able to figure out column types even
// if a row doesn't have values for every column.
// we used to determine the class of a column from it's first non-null Value,
// but that's just not always true for every element of the column, so we
// need to check until we're sure all values are the same, or we can decide
// if the column is a String or Object.
List<Integer> colsToFigure = new ArrayList<>();
for ( int i = 0; i < columns; i++ ) {
colsToFigure.add( i );
}
Class<?> arr[] = new Class<?>[columns];
// now iterate as far as we have to until we have all the column classes
Iterator<Value[]> it = newdata.iterator();
Set<Class<?>> finalClasses
= new HashSet<>( Arrays.asList( String.class, Object.class ) );
while ( !colsToFigure.isEmpty() && it.hasNext() ) {
Value[] first = it.next();
// we have a row of data, so see if it can provide a class for
// any column we don't yet have a class for
ListIterator<Integer> colit = colsToFigure.listIterator();
while ( colit.hasNext() ) {
int col = colit.next();
Value v = first[col];
Class<?> k = getClassForValue( v );
// getClassForValue returns Object when the classtype can't be determined
if ( !Object.class.equals( k ) ) {
Class<?> previousK = arr[col];
if ( null == previousK ) {
// first time we've set a value for this column
arr[col] = k;
}
else if ( previousK != k ) {
// we have a previous column class,
if ( !finalClasses.contains( previousK ) ) {
// we're not already at a "final" class, so figure out what we want
if ( finalClasses.contains( k ) ) {
// we're going to be final, so set it
arr[col] = k;
}
else {
// we have two different classes,
// assume they're irreconcilable
arr[col] = Object.class;
}
}
// else we're "final," so don't change
}
if ( finalClasses.contains( arr[col] ) ) {
colit.remove();
}
}
}
}
// remove any columns where we have a class, albeit a non-"final" one
ListIterator<Integer> li = colsToFigure.listIterator();
while ( li.hasNext() ) {
int col = li.next();
if ( null != arr[col] ) {
li.remove();
}
}
// we don't have any data for the remaining columns, so do something safe
for ( int col : colsToFigure ) {
arr[col] = Object.class;
}
columnClasses.addAll( Arrays.asList( arr ) );
}
return columnClasses;
}
/**
* Derive the data type for the value of a tabular field
*
* @param v The value for which we need to derive a class
* @return The class describing the value's data type
*/
public static Class<?> getClassForValue( Value v ) {
if ( v instanceof URI ) {
return URI.class;
}
if ( v instanceof Literal ) {
Literal l = Literal.class.cast( v );
URI dt = l.getDatatype();
return ( TYPELOOKUP.containsKey( dt )
? TYPELOOKUP.get( dt ) : String.class );
}
return Object.class;
}
/**
* Parse the data type of an XML entity based on its string content
*
* @param input The XML entity, in string form
* @return The entity instance, properly classed
*/
public static Object parseXMLDatatype( String input ) {
if ( input == null ) {
return null;
}
input = input.trim();
String[] pieces = input.split( "\"" );
if ( pieces.length != 3 ) {
return removeExtraneousDoubleQuotes( input );
}
Class<?> theClass = null;
for ( URI datatypeUri : TYPELOOKUP.keySet() ) {
if ( pieces[2].contains( datatypeUri.stringValue() ) ) {
theClass = TYPELOOKUP.get( datatypeUri );
}
}
String dataPiece = pieces[1];
if ( theClass == Double.class && XMLDatatypeUtil.isValidDouble( dataPiece ) ) {
return XMLDatatypeUtil.parseDouble( dataPiece );
}
if ( theClass == Float.class && XMLDatatypeUtil.isValidFloat( dataPiece ) ) {
return XMLDatatypeUtil.parseFloat( dataPiece );
}
if ( theClass == Integer.class && XMLDatatypeUtil.isValidInteger( dataPiece ) ) {
return XMLDatatypeUtil.parseInteger( dataPiece );
}
if ( theClass == Boolean.class && XMLDatatypeUtil.isValidBoolean( dataPiece ) ) {
return XMLDatatypeUtil.parseBoolean( dataPiece );
}
if ( theClass == Date.class && XMLDatatypeUtil.isValidDate( dataPiece ) ) {
return XMLDatatypeUtil.parseCalendar( dataPiece );
}
return removeExtraneousDoubleQuotes( input );
}
/**
* Gets the datatype of the given val. If val is null, returns null. It
* returns {@link XMLSchema#ANYURI} for a URI, {@link XMLSchema#ENTITY} for a
* BNode, and {@link Literal#getDatatype()} if it's a literal. If it's not a
* {@link Value}, then it is converted to a Value first, and reprocessed. If
* we have a literal, but a null datatype, returns {@link XMLSchema#STRING}
*
* @param val
* @return
*/
public static URI getDatatype( Object val ) {
if ( null == val ) {
return null;
}
if ( val instanceof URI ) {
return XMLSchema.ANYURI;
}
else if ( val instanceof Literal ) {
Literal l = Literal.class.cast( val );
return ( null == l.getDatatype() ? XMLSchema.STRING : l.getDatatype() );
}
else if ( val instanceof BNode ) {
return XMLSchema.ENTITY;
}
Class<?> theClass = val.getClass();
return REVTYPELOOKUP.getOrDefault( theClass, XMLSchema.STRING );
}
/**
* Gets a proper native object from a given RDF value
*
* @param value The RDF Value
* @return A proper native object
*/
public static Object getObjectFromValue( Value value ) {
if ( value == null ) {
return null;
}
Class<?> theClass = getClassForValue( value );
if ( URI.class == theClass ) {
return value;
}
Literal input = Literal.class.cast( value );
String val = input.getLabel();
boolean isempty = val.isEmpty();
if ( theClass == Double.class ) {
return ( isempty ? null : input.doubleValue() );
}
if ( theClass == Integer.class ) {
return ( isempty ? null : input.intValue() );
}
if ( theClass == Boolean.class ) {
return ( isempty ? null : input.booleanValue() );
}
if ( theClass == Float.class ) {
return ( isempty ? null : input.floatValue() );
}
if ( theClass == Date.class ) {
return ( isempty ? null : getDate( input.calendarValue() ) );
}
return input.stringValue();
}
/**
* Gets a proper native object from a given RDF value
*
* @param value The RDF Value
* @return A proper native object
*/
public static Number getNumberFromValue( Value value ) {
if ( value == null ) {
return null;
}
Class<?> theClass = getClassForValue( value );
if ( URI.class == theClass ) {
return null;
}
Literal input = Literal.class.cast( value );
String val = input.getLabel();
boolean isempty = val.isEmpty();
if ( theClass == Double.class ) {
return ( isempty ? null : input.doubleValue() );
}
if ( theClass == Integer.class ) {
return ( isempty ? null : input.intValue() );
}
if ( theClass == Boolean.class ) {
return ( isempty ? 0 : 1 );
}
if ( theClass == Float.class ) {
return ( isempty ? null : input.floatValue() );
}
if ( theClass == Date.class ) {
return ( isempty ? null : getDate( input.calendarValue() ).getTime() );
}
return null;
}
public static boolean isNumericValue( Value value ) {
if ( value == null ) {
return false;
}
Set<Class<?>> numberTypes
= new HashSet<>( Arrays.asList( Double.class, Integer.class, Float.class ) );
return numberTypes.contains( getClassForValue( value ) );
}
/**
* Converts a native object instance to its equivalent RDF value
*
* @param o The native object to be converted
* @return A proper RDF Value
*/
public static Value getValueFromObject( Object o ) {
if ( null == o ) {
return null;
}
if ( o instanceof Value ) {
return Value.class.cast( o );
}
if ( o instanceof String ) {
return vf.createLiteral( String.class.cast( o ) );
}
else if ( o instanceof Double ) {
return vf.createLiteral( Double.class.cast( o ) );
}
else if ( o instanceof Integer ) {
return vf.createLiteral( Integer.class.cast( o ) );
}
else if ( o instanceof Boolean ) {
return vf.createLiteral( Boolean.class.cast( o ) );
}
else if ( o instanceof Date ) {
return vf.createLiteral( Date.class.cast( o ) );
}
else if ( o instanceof Float ) {
return vf.createLiteral( Float.class.cast( o ) );
}
logger.warn( "unhandled data type for object: " + o );
return null;
}
/**
* Internal convenience method to eliminate unnecessary quotes
*
* @param input The input containing potentially unnecessary quote chars
* @return The string content without the unnecessary quotes
*/
private static String removeExtraneousDoubleQuotes( String input ) {
while ( input != null && input.length() > 2
&& input.charAt( 0 ) == '\"'
&& input.charAt( input.length() - 1 ) == '\"' ) {
input = input.substring( 1, input.length() - 1 );
}
return input;
}
public static boolean isValidUriChars( String raw ) {
// Check if character is valid in the localpart (http://en.wikipedia.org/wiki/QName)
// NC is "non-colonized" name: http://www.w3.org/TR/xmlschema-2/#NCName
return XMLChar.isValidNCName( raw );
// return VALIDCHARS.matcher( raw ).matches();
}
/**
* Derives an RDF Value from a proper datatype and the stringified version of
* the content
*
* @param datatype URI describing the datatype of the RDF entity
* @param content The stringified version of the value
* @return A proper RDF value
*/
public static Value getValueFromDatatypeAndString( URI datatype, String content ) {
return vf.createLiteral( content, datatype );
}
public static List<Value> sortValues( Collection<Value> vals ) {
List<Value> values = new ArrayList<>( vals );
Collections.sort( values, new Comparator<Value>() {
@Override
public int compare( Value v1, Value v2 ) {
Number n1 = getNumberFromValue( v1 );
Number n2 = getNumberFromValue( v2 );
double diff = n1.doubleValue() - n2.doubleValue();
if ( diff < 0 ) {
return -1;
}
else if ( diff > 0 ) {
return 1;
}
return 0;
}
} );
return values;
}
public static URI getUriFromRawString( String raw, Map<String, String> namespaces ) {
//resolve namespace
URI uri = null;
if ( raw.startsWith( "<" ) && raw.endsWith( ">" ) ) {
uri = vf.createURI( raw.substring( 1, raw.length() - 1 ) );
return uri;
}
// if raw starts with <something>://, then assume it's just a URI
Matcher m = URISTARTPATTERN.matcher( raw );
if ( m.matches() ) {
return vf.createURI( raw );
}
if ( raw.contains( ":" ) ) {
String[] pieces = raw.split( ":" );
if ( 2 == pieces.length ) {
String namespace = namespaces.get( pieces[0] );
if ( null == namespace || namespace.trim().isEmpty() ) {
logger.warn( "No namespace found for raw value: " + raw );
}
else {
uri = vf.createURI( namespace, pieces[1] );
}
}
else {
logger.warn( "cannot resolve namespace for: " + raw + " (too many colons)" );
}
}
//else {
// since this will will always throw an error (it can't be an absolute URI)
// we'll just return null, as usual
//uri = vf.createURI( raw );
//}
return uri;
}
public static Value getRDFStringValue( String rawval, Map<String, String> namespaces,
ValueFactory vf ) {
// if rawval looks like a URI, assume it is
Matcher urimatcher = URISTARTPATTERN.matcher( rawval );
if ( urimatcher.matches() ) {
return vf.createURI( rawval );
}
Matcher m = NAMEPATTERN.matcher( rawval );
String val;
String lang;
if ( m.matches() ) {
String g1 = m.group( 1 );
String g2 = m.group( 2 );
val = ( null == g1 ? g2 : g1 );
lang = m.group( 3 );
}
else {
val = rawval;
lang = "";
m = DTPATTERN.matcher( rawval );
if ( m.matches() ) {
val = m.group( 1 );
String typestr = m.group( 2 );
try {
URI type = getUriFromRawString( typestr, namespaces );
if ( null == type ) {
logger.warn( "probably misinterpreting as string (unknown type URI?) :"
+ rawval );
val = rawval;
}
else {
return vf.createLiteral( val, type );
}
}
catch ( Exception e ) {
logger.warn( "probably misinterpreting as string (unknown type URI?) :"
+ rawval, e );
val = rawval;
}
}
}
return ( lang.isEmpty() ? vf.createLiteral( val )
: vf.createLiteral( val, lang ) );
}
}