UriBuilder.java

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package com.ostrichemulators.semtool.util;

import java.util.Objects;
import java.util.UUID;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang.RandomStringUtils;
import org.apache.log4j.Logger;
import org.openrdf.model.Resource;
import org.openrdf.model.URI;
import org.openrdf.model.ValueFactory;
import org.openrdf.model.impl.URIImpl;
import org.openrdf.model.impl.ValueFactoryImpl;

/**
 * A class that helps to build URIs.
 *
 * @author ryan
 */
public class UriBuilder {

	private static final Logger log = Logger.getLogger( UriBuilder.class );
	// matches any string that ends in one of the "concatenator" characters;
	// group 1 is everything before the concatenator, group 2 is the concatenator
	private static final Pattern CONCAT_PAT = Pattern.compile( "(.*)([/#:])$" );
	private static Class<? extends UriBuilder> bldrclass = UriBuilder.class;
	private static Class<? extends UriSanitizer> saniclass = DefaultSanitizer.class;

	private final ValueFactory vf = new ValueFactoryImpl();
	private final StringBuilder content = new StringBuilder();
	// never null: starts as a DefaultSanitizer, and setSanitizer ignores nulls
	private UriSanitizer sanitizer = new DefaultSanitizer();
	private boolean lastIsConcatenator;

	/**
	 * Sets the class to use when a user calls
	 * {@link #getBuilder(java.lang.String)} or
	 * {@link #getBuilder(org.openrdf.model.URI)}. A null argument is ignored.
	 *
	 * @param <T> the type of class (must extend or be UriBuilder)
	 * @param klass the class to use
	 */
	public static <T extends UriBuilder> void setFactoryClass( Class<T> klass ) {
		if ( null != klass ) {
			bldrclass = klass;
		}
	}

	/**
	 * Sets the sanitizer class that {@link #getBuilder(java.lang.String)}
	 * instantiates for every new builder. A null argument is ignored.
	 *
	 * @param <T> the type of sanitizer
	 * @param klass the sanitizer class to use
	 */
	public static <T extends UriSanitizer> void setDefaultSanitizerClass( Class<T> klass ) {
		if ( null != klass ) {
			saniclass = klass;
		}
	}

	/**
	 * A convenience to call {@link #setFactoryClass(java.lang.Class)} and
	 * {@link #setDefaultSanitizerClass(java.lang.Class)} together
	 *
	 * @param <T> the type of builder
	 * @param <V> the type of sanitizer
	 * @param bldr the builder class to use
	 * @param sani the sanitizer class to use
	 */
	public static <T extends UriBuilder, V extends UriSanitizer> void setFactoryClasses( Class<T> bldr, Class<V> sani ) {
		setFactoryClass( bldr );
		setDefaultSanitizerClass( sani );
	}

	/**
	 * Creates an uninitialized instance (no base URI has been set yet)
	 */
	protected UriBuilder() {

	}

	/**
	 * Calls {@link #setBase(java.lang.String)}, and throws the same exceptions
	 *
	 * @param owl the base URI from which to build other URIs
	 */
	protected UriBuilder( String owl ) {
		setBase( owl );
	}

	/**
	 * Sets the sanitizer to use, assuming it is non-null
	 *
	 * @param s the sanitizer to use. If null, the current sanitizer is kept
	 */
	public final void setSanitizer( UriSanitizer s ) {
		if ( null != s ) {
			sanitizer = s;
		}
	}

	/**
	 * Sets the base URI from which to build other URIs. Any content this
	 * builder already holds is replaced, not appended to.
	 *
	 * @param owl the base URI
	 *
	 * @throws IllegalArgumentException if <code>owl</code> would lead to invalid
	 * URIs
	 */
	protected final void setBase( String owl ) {
		if ( null == owl || owl.isEmpty() ) {
			throw new IllegalArgumentException( "argument cannot be null or empty" );
		}

		// do a check to see that we have a valid URI to start with
		String validated;
		try {
			validated = vf.createURI( owl ).stringValue();
		}
		catch ( Exception e ) {
			throw new IllegalArgumentException( "invalid URI component", e );
		}

		// only touch our state once we know the new base is usable, and reset
		// first so that a repeated call doesn't glue two bases together
		content.setLength( 0 );
		content.append( validated );
		lastIsConcatenator = endsWithConcatenator( owl );
	}

	/**
	 * Checks if the given string ends in a separator ("/", "#", ":")
	 *
	 * @param str the string to check (must not be null)
	 * @return true, if the last character is a separator
	 */
	private static boolean endsWithConcatenator( String str ) {
		return ( str.endsWith( "/" ) || str.endsWith( "#" ) || str.endsWith( ":" ) );
	}

	/**
	 * Converts the "starter" URI into an actual URI. If the working URI ends in a
	 * separator ("/", "#", ":"), the separator is removed.
	 *
	 * @return a URI
	 */
	public URI toUri() {
		String uristart = content.toString();
		if ( endsWithConcatenator( uristart ) ) {
			uristart = uristart.substring( 0, uristart.length() - 1 );
		}
		return vf.createURI( uristart );
	}

	/**
	 * Same as {@link #toUri()}
	 *
	 * @return a URI
	 */
	public URI build() {
		return toUri();
	}

	/**
	 * Builds a URI from a copy of this builder plus one extra segment. This
	 * instance is not modified. Note that the segment is sanitized here, and
	 * then passed to {@link #add(java.lang.String)} on the copy, which runs
	 * its own sanitizer on it a second time.
	 *
	 * @param extra the extra segment to add
	 * @return a URI
	 */
	public URI build( String extra ) {
		return copy().add( sanitizer.sanitize( extra ) ).build();
	}

	/**
	 * Creates a URI from the current content plus a randomly-generated local
	 * name (one random letter followed by a UUID). If the current content does
	 * not end in a separator, a "/" is inserted first, just like
	 * {@link #add(java.lang.String)} does, so the random name never runs into
	 * the last segment of the base. This instance is not modified.
	 *
	 * @return a URI
	 */
	public URI uniqueUri() {
		StringBuilder uristr = new StringBuilder( content );
		if ( !lastIsConcatenator ) {
			uristr.append( "/" );
		}
		uristr.append( RandomStringUtils.randomAlphabetic( 1 ) );
		uristr.append( UUID.randomUUID().toString() );
		return vf.createURI( uristr.toString() );
	}

	/**
	 * Gets a new UriBuilder from this instance's content
	 *
	 * @return a new builder
	 */
	public UriBuilder copy() {
		UriBuilder bldr = new UriBuilder( content.toString() );
		bldr.setSanitizer( cloneSanitizer() );
		return bldr;
	}

	/**
	 * Creates a fresh sanitizer of the same class as this instance's sanitizer.
	 * If that class cannot be instantiated through a no-argument constructor,
	 * the problem is logged and a {@link DefaultSanitizer} is returned instead.
	 *
	 * @return a new sanitizer, never null
	 */
	private UriSanitizer cloneSanitizer() {
		try {
			return sanitizer.getClass().getDeclaredConstructor().newInstance();
		}
		catch ( ReflectiveOperationException e ) {
			log.error( "BUG: unable to create new instance of " + sanitizer, e );
			return new DefaultSanitizer();
		}
	}

	/**
	 * Gets a new UriBuilder like {@link #copy()}, but with an unsanitized segment
	 *
	 * @param additional the extra stuff to add to the content. MUST BE URI-VALID
	 *
	 * @return a new copy of this instance's content, with the extra segment added
	 */
	private UriBuilder internalCopy( String additional ) {
		StringBuilder newcontent = new StringBuilder( content );
		if ( !lastIsConcatenator ) {
			newcontent.append( "/" );
		}
		newcontent.append( additional );

		UriBuilder bldr = getBuilder( newcontent.toString() );
		// the factory installs the default sanitizer class; we want our own
		bldr.setSanitizer( cloneSanitizer() );
		return bldr;
	}

	/**
	 * Creates a new builder (of the configured factory class) for the given
	 * namespace, with a new sanitizer (of the configured sanitizer class). If
	 * either class cannot be instantiated through a no-argument constructor,
	 * the problem is logged and a plain UriBuilder or {@link DefaultSanitizer}
	 * is used instead.
	 *
	 * @param ns the base URI for the new builder
	 * @return a new builder
	 *
	 * @throws IllegalArgumentException if <code>ns</code> is null, empty, or
	 * not a valid URI
	 */
	public static UriBuilder getBuilder( String ns ) {
		UriBuilder ldr;
		try {
			ldr = bldrclass.getDeclaredConstructor().newInstance();
		}
		catch ( ReflectiveOperationException e ) {
			log.error( "BUG: cannot create UriBuilder instance; using fallback", e );
			ldr = new UriBuilder();
		}

		UriSanitizer sani;
		try {
			sani = saniclass.getDeclaredConstructor().newInstance();
		}
		catch ( ReflectiveOperationException e ) {
			log.error( "BUG: cannot create UriSanitizer instance; using fallback", e );
			sani = new DefaultSanitizer();
		}

		ldr.setBase( ns );
		ldr.setSanitizer( sani );
		return ldr;
	}

	/**
	 * Same as {@link #getBuilder(java.lang.String)}, using the URI's string
	 * value
	 *
	 * @param ns the base URI for the new builder
	 * @return a new builder
	 *
	 * @throws IllegalArgumentException if <code>ns</code> is null
	 */
	public static UriBuilder getBuilder( URI ns ) {
		if ( null == ns ) {
			throw new IllegalArgumentException( "argument cannot be null or empty" );
		}
		return getBuilder( ns.stringValue() );
	}

	/**
	 * Adds a new part to the URI, after sanitizing it. If <code>localname</code>
	 * ends in a "concatenator" character, the character is preserved, but
	 * everything else is sent to the sanitizer. A null or blank
	 * <code>localname</code> leaves this builder untouched.
	 *
	 * @param localname the part to add to the URI
	 *
	 * @return this UriBuilder
	 */
	public UriBuilder add( String localname ) {
		localname = ( null == localname ? "" : localname.trim() );

		if ( localname.isEmpty() ) {
			return this;
		}

		if ( !lastIsConcatenator ) {
			content.append( "/" );
		}

		String newlocal;
		String lastchar;
		Matcher m = CONCAT_PAT.matcher( localname );
		if ( m.matches() ) {
			// keep the trailing concatenator out of the sanitizer's reach
			newlocal = m.group( 1 );
			lastchar = m.group( 2 );
			lastIsConcatenator = true;
		}
		else {
			newlocal = localname;
			lastchar = "";
			lastIsConcatenator = false;
		}

		content.append( sanitizer.sanitize( newlocal ) ).append( lastchar );

		return this;
	}

	/**
	 * Does the given string start with our current content?
	 *
	 * @param uri the uri to check
	 * @return true, if <code>uri.startsWith( content.toString() )</code>. A
	 * null argument is never contained
	 */
	public boolean contains( String uri ) {
		return ( null != uri && uri.startsWith( content.toString() ) );
	}

	/**
	 * Does the given resource's string value start with our current content?
	 *
	 * @param uri the resource to check
	 * @return true, if the resource's string value starts with our content. A
	 * null argument is never contained
	 */
	public boolean contains( Resource uri ) {
		return ( null != uri && contains( uri.stringValue() ) );
	}

	/**
	 * Checks if the given URI's namespace is the same as our current content.
	 * This is a stricter check than {@link #contains(org.openrdf.model.Resource)}
	 * because the namespace must match the this builder's content exactly, and
	 * not just start the same way
	 *
	 * @param uri the uri to check
	 * @return true, if
	 * <code>uri.getNamespace().equals( content.toString() )</code>. A null
	 * argument never matches
	 */
	public boolean namespaceOf( URI uri ) {
		return ( null != uri && content.toString().equals( uri.getNamespace() ) );
	}

	/**
	 * Gets a copy of this instance, as if in the core namespace
	 *
	 * @return a copy of this instance
	 */
	public UriBuilder getCoreUri() {
		return internalCopy( "core#" );
	}

	/**
	 * A convenience {@link #getCoreUri()}.{@link #add(java.lang.String) }.
	 *
	 * @param localname the core name to add
	 *
	 * @return the URI of <code>localname</code> in the core namespace
	 */
	public URI getCoreUri( String localname ) {
		return getCoreUri().add( localname ).build();
	}

	/**
	 * Gets a copy of this instance, as if in the relation namespace
	 *
	 * @return a copy of this instance
	 */
	public UriBuilder getRelationUri() {
		return internalCopy( "Relation/" );
	}

	/**
	 * A convenience {@link #getRelationUri() }.{@link #add(java.lang.String) }.
	 *
	 * @param localname the relation name to add
	 *
	 * @return the URI of <code>localname</code> in the relation namespace
	 */
	public URI getRelationUri( String localname ) {
		return getRelationUri().add( localname ).build();
	}

	/**
	 * A convenience to {@link #getRelationUri(java.lang.String)} with
	 * <code>Constants.CONTAINS</code>
	 *
	 * @return the URI of the "contains" relation
	 */
	public URI getContainsUri() {
		return getRelationUri( Constants.CONTAINS );
	}

	/**
	 * Gets a copy of this instance, as if in the concept instance (does not have
	 * a trailing "/")
	 *
	 * @return a copy of this instance
	 */
	public UriBuilder getConceptUri() {
		return internalCopy( Constants.DEFAULT_NODE_CLASS );
	}

	/**
	 * Gets a copy of this instance, as if in the concept namespace (has a
	 * trailing "/")
	 *
	 * @return a copy of this instance
	 */
	public UriBuilder getConceptNamespace() {
		return internalCopy( Constants.DEFAULT_NODE_CLASS + "/" );
	}

	/**
	 * Gets the working URI exactly as it currently stands. Unlike
	 * {@link #toUri()}, a trailing separator is not removed.
	 *
	 * @return the current content
	 */
	@Override
	public String toString() {
		return content.toString();
	}

	/**
	 * A convenience to {@link #getConceptUri()}.{@link  #add(java.lang.String)
	 * }.
	 *
	 * @param localname the concept name to add
	 *
	 * @return the URI of <code>localname</code> under the concept URI
	 */
	public URI getConceptUri( String localname ) {
		return getConceptUri().add( localname ).build();
	}

	/**
	 * Gets the sanitizer this instance uses
	 *
	 * @return the sanitizer
	 */
	public UriSanitizer getSanitizer() {
		return sanitizer;
	}

	/**
	 * Hashes the textual content, to agree with
	 * {@link #equals(java.lang.Object)}. The StringBuilder itself cannot be
	 * hashed here: it does not override <code>hashCode</code>, so two builders
	 * with identical content would end up with different hash codes.
	 *
	 * @return a hash of the current content
	 */
	@Override
	public int hashCode() {
		int hash = 7;
		hash = 29 * hash + content.toString().hashCode();
		return hash;
	}

	/**
	 * Two builders are equal if they are of the exact same class and have the
	 * same textual content. The sanitizer does not take part in the comparison.
	 *
	 * @param obj the object to compare against
	 * @return true, if the builders are equal
	 */
	@Override
	public boolean equals( Object obj ) {
		if ( this == obj ) {
			return true;
		}
		if ( obj == null ) {
			return false;
		}
		if ( getClass() != obj.getClass() ) {
			return false;
		}
		final UriBuilder other = (UriBuilder) obj;
		return content.toString().equals( other.content.toString() );
	}

	/**
	 * The sanitizer used when nothing else has been configured. Punctuation and
	 * whitespace are replaced with underscores; anything that still isn't
	 * usable (or is too long) is replaced with a random, UUID-based name.
	 */
	public static class DefaultSanitizer implements UriSanitizer {

		// TBD: qualify this limit, consider making a semoss property
		private static final int LOCAL_PART_MAX_LENGTH = 54;
		// compiled once instead of on every call to sanitize
		private static final Pattern PUNCT_OR_SPACE
				= Pattern.compile( "(\\p{Punct}|\\s)" );

		/**
		 * Gets a random local name: one random letter followed by a UUID
		 *
		 * @return a random local name
		 */
		public String getUUIDLocalName() {
			return RandomStringUtils.randomAlphabetic( 1 ) + UUID.randomUUID().toString();
		}

		/**
		 * Converts the given string to something usable as the local part of a
		 * URI.
		 *
		 * @param raw the string to sanitize. A null is treated like a string
		 * that cannot be sanitized
		 * @return <code>raw</code> itself if it is already valid; otherwise
		 * <code>raw</code> trimmed, with punctuation and whitespace replaced by
		 * underscores. If the result is empty, still invalid, or longer than the
		 * maximum length, a random name from {@link #getUUIDLocalName()}
		 */
		@Override
		public String sanitize( String raw ) {
			if ( null == raw ) {
				// nothing to work with, so fall back to a random name
				return getUUIDLocalName();
			}

			// Check if the string is already valid:
			String sanitized = raw;
			if ( !RDFDatatypeTools.isValidUriChars( raw ) ) {
				// Attempt a simple sanitizing:
				String rawWithUnderscores
						= PUNCT_OR_SPACE.matcher( raw.trim() ).replaceAll( "_" );

				// If it's still not clean enough, just use a random URI (below)
				sanitized = ( RDFDatatypeTools.isValidUriChars( rawWithUnderscores )
						? rawWithUnderscores : "" );
			}

			// At issue here was that truncating the local part at the length limit (as done previously) did not
			// guarantee uniqueness.  This led to occasional URI collisions and the overlap of data.  So now when
			// we hit the max length limit, we grab a UUID since we are otherwise unable to check for uniqueness
			// at this stage.
			//
			return ( sanitized.length() > LOCAL_PART_MAX_LENGTH || sanitized.isEmpty() )
					? getUUIDLocalName() : sanitized;
		}
	}
}