EngineConsistencyChecker.java

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package com.ostrichemulators.semtool.rdf.engine.util;

import com.ostrichemulators.semtool.rdf.engine.api.IEngine;
import com.ostrichemulators.semtool.rdf.query.util.impl.VoidQueryAdapter;
import com.ostrichemulators.semtool.util.MultiMap;
import java.io.Closeable;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import javax.swing.text.Document;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.spell.PlainTextDictionary;
import org.apache.lucene.search.spell.SpellChecker;
import org.apache.lucene.search.spell.StringDistance;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.openrdf.model.URI;
import org.openrdf.model.ValueFactory;
import org.openrdf.query.BindingSet;

/**
 * Finds "near" matches among the labels of instances stored in an engine.
 * Per the original author's note, this is meant for checking the values from
 * a Loading Sheet against the values in an existing engine. The comparison is
 * purely string-based, and uses the {@link StringDistance} supplied at
 * construction time.
 * <p>
 * Typical usage: call {@link #add(java.util.Collection, Type)} to load the
 * instance labels of one or more classes, then call
 * {@link #check(org.openrdf.model.URI, float)} for each class of interest, and
 * finally {@link #release()} to drop the cached labels.
 *
 * @author ryan
 */
public class EngineConsistencyChecker {

	private static final Logger log = Logger.getLogger( EngineConsistencyChecker.class );

	/**
	 * The kind of class URI being handed to
	 * {@link #add(java.util.Collection, Type)}
	 */
	public static enum Type {

		CONCEPT, RELATIONSHIP
	};

	private final IEngine engine;
	// if true, every element is compared against the elements of all loaded types
	private final boolean across;
	private final StringDistance strdist;
	// instance URI -> label (only one label is retained per instance)
	private final Map<URI, String> labels = new HashMap<>();
	// instance URI -> the class URI it was loaded under
	private final Map<URI, URI> uriToTypeLkp = new HashMap<>();
	// class URI -> the instance URIs loaded under it
	private final MultiMap<URI, URI> typeToURILkp = new MultiMap<>();

	/**
	 * Creates a new checker. Nothing is queried until
	 * {@link #add(java.util.Collection, Type)} is called.
	 *
	 * @param eng the engine to query for instances and labels
	 * @param across if <code>true</code>, {@link #check} compares each element
	 * against the elements of every loaded type instead of only its own type
	 * @param dist the string distance used to score candidate matches
	 */
	public EngineConsistencyChecker( IEngine eng, boolean across, StringDistance dist ) {
		this.engine = eng;
		this.across = across;
		this.strdist = dist;
	}

	/**
	 * Discards every label and type lookup loaded so far
	 */
	public void release() {
		labels.clear();
		uriToTypeLkp.clear();
		typeToURILkp.clear();
	}

	/**
	 * Adds the given uris as the specified type
	 *
	 * @param uris A collection of concept classes (not instances) when
	 * <code>type</code> is {@link Type#CONCEPT}; otherwise a collection of
	 * relationship superclasses
	 * @param type how to interpret <code>uris</code>. Anything other than
	 * {@link Type#CONCEPT} is treated as a relationship
	 */
	public void add( Collection<URI> uris, Type type ) {
		if ( Type.CONCEPT == type ) {
			for ( URI uri : uris ) {
				makeConceptDocuments( uri );
			}
		}
		else {
			for ( URI uri : uris ) {
				makeRelationDocuments( uri );
			}
		}
	}

	/**
	 * Loads the labels of every instance of the given concept class
	 *
	 * @param concept the concept class whose instances should be loaded
	 */
	private void makeConceptDocuments( URI concept ) {
		String query = "SELECT DISTINCT ?s ?slabel WHERE { ?s a ?concept ; rdfs:label ?slabel } ORDER BY ?s";
		LabelCollector vqa = new LabelCollector( query, concept, "s", "slabel" );

		vqa.bind( "concept", concept );
		// log.debug( vqa.bindAndGetSparql() );
		engine.queryNoEx( vqa );
	}

	/**
	 * Loads the labels of every sub-property of the given relationship
	 *
	 * @param superclass the relationship whose sub-properties should be loaded
	 */
	private void makeRelationDocuments( URI superclass ) {
		// get all subclasses of superclass
		String query = "SELECT DISTINCT ?rel ?label WHERE {\n"
				+ " ?rel rdfs:subPropertyOf ?superclass ; rdfs:label ?label .\n"
				+ " FILTER( ?rel != ?superclass )\n"
				+ "} ORDER BY ?rel";
		LabelCollector vqa = new LabelCollector( query, superclass, "rel", "label" );

		vqa.bind( "superclass", superclass );
		log.debug( vqa.bindAndGetSparql() );
		engine.queryNoEx( vqa );
	}

	/**
	 * Gets the number of elements loaded for the given type
	 *
	 * @param uri the concept/relation class (not instance)
	 * @return the number of elements loaded under <code>uri</code>
	 */
	public int getItemsForType( URI uri ) {
		return typeToURILkp.getNN( uri ).size();
	}

	/**
	 * Resolves "near" matches from the elements of the given type. If this
	 * checker was created with <code>across</code> set to <code>true</code>,
	 * each element will be compared to all elements of all types.
	 * <p>
	 * Matching is attempted with a Lucene {@link SpellChecker} first. Any
	 * element that throws during that lookup is re-resolved with
	 * {@link #fallbackResolve}.
	 *
	 * @param uri the concept/relation class (not instance) to resolve
	 * @param minDistance the minimum allowable similarity
	 * @return map of uri-to-hits. Elements without any hits have no entry
	 */
	public MultiMap<URI, Hit> check( URI uri, final float minDistance ) {
		MultiMap<URI, Hit> hits = new MultiMap<>();

		// get our universe of possible hits
		Map<URI, String> possibles = getHitUniverse( uri );
		// several URIs can share one label, so we need label -> URIs as well
		MultiMap<String, URI> revpos = MultiMap.flip( possibles );

		Directory ramdir = new RAMDirectory();
		StandardAnalyzer analyzer = null;
		SpellChecker speller = null;

		List<URI> errors = new ArrayList<>();
		try {
			analyzer = new StandardAnalyzer();
			IndexWriterConfig config = new IndexWriterConfig( analyzer );
			speller = new SpellChecker( ramdir, strdist );

			// the dictionary is a plain-text word list, one label per line
			StringBuilder names = new StringBuilder();
			for ( String s : possibles.values() ) {
				names.append( s ).append( "\n" );
			}
			PlainTextDictionary ptd = new PlainTextDictionary( new StringReader( names.toString() ) );
			speller.indexDictionary( ptd, config, true );

			// getNN (not get) so that a type that was never loaded is simply
			// an empty loop instead of a NullPointerException
			for ( URI needle : typeToURILkp.getNN( uri ) ) {
				String needlelabel = labels.get( needle );
				try {
					String[] suggestions = speller.suggestSimilar( needlelabel, 20, minDistance );
					for ( String s : suggestions ) {
						// found a match, so figure out what we actually matched
						float distance = strdist.getDistance( needlelabel, s );

						for ( URI match : revpos.get( s ) ) {
							hits.add( needle,
									new Hit( match, s, uriToTypeLkp.get( match ), distance ) );
						}
					}
				}
				catch ( Exception e ) {
					// our fallback resolution always works; it's just a ton slower
					errors.add( needle );
				}
			}
		}
		catch ( Exception e ) {
			log.error( e, e );
		}
		finally {
			// close the speller before the directory it reads from
			for ( Closeable c : new Closeable[]{ speller, analyzer, ramdir } ) {
				if ( null != c ) {
					try {
						c.close();
					}
					catch ( Exception e ) {
						log.warn( e, e );
					}
				}
			}
		}

		if ( !errors.isEmpty() ) {
			fallbackResolve( errors, possibles, hits, strdist, minDistance );
		}

		return hits;
	}

	/**
	 * Resolves terms that could not be resolved with the lucene approach. This
	 * brute-force function is significantly slower, but always works
	 *
	 * @param needles the URIs that produced errors in lucene
	 * @param possibles the set of all possible solutions
	 * @param hits populate this multimap with matches
	 * @param levy the string distance object to use to measure hits
	 * @param minDistance the minimum similarity measure
	 */
	private void fallbackResolve( Collection<URI> needles, Map<URI, String> possibles,
			MultiMap<URI, Hit> hits, StringDistance levy, float minDistance ) {
		log.debug( "falling back to resolve " + needles.size() + " items" );

		for ( URI needle : needles ) {
			String needlelabel = labels.get( needle );

			for ( Map.Entry<URI, String> en : possibles.entrySet() ) {
				URI match = en.getKey();
				String matchlabel = en.getValue();

				float distance = levy.getDistance( needlelabel, matchlabel );
				// an element never counts as a hit for itself
				if ( distance >= minDistance && !match.equals( needle ) ) {
					hits.add( needle,
							new Hit( match, matchlabel, uriToTypeLkp.get( match ), distance ) );
				}
			}
		}
	}

	/**
	 * Gets the candidate elements (and their labels) that elements of the given
	 * type may be matched against
	 *
	 * @param type the concept/relation class being checked
	 * @return every loaded element if <code>across</code> is set; otherwise
	 * only the elements loaded under <code>type</code>
	 */
	private Map<URI, String> getHitUniverse( URI type ) {
		Map<URI, String> possibles = new HashMap<>();
		if ( across ) {
			possibles.putAll( labels );
		}
		else {
			for ( URI key : typeToURILkp.getNN( type ) ) {
				possibles.put( key, labels.get( key ) );
			}
		}

		return possibles;
	}

	/**
	 * Query adapter that reads an element URI and a label from every result
	 * row, and files the element under a fixed type in the enclosing checker's
	 * lookups. Rows for the same element are expected to arrive consecutively
	 * (both queries that use this class ORDER BY the element variable).
	 */
	private final class LabelCollector extends VoidQueryAdapter {

		private final URI type;
		private final String elementVar;
		private final String labelVar;
		private final Set<String> seenLabels = new HashSet<>();
		private URI last = null;

		/**
		 * @param sparql the query to run
		 * @param type the class URI to file every element under
		 * @param elementVar the name of the binding holding the element URI
		 * @param labelVar the name of the binding holding the element's label
		 */
		LabelCollector( String sparql, URI type, String elementVar, String labelVar ) {
			super( sparql );
			this.type = type;
			this.elementVar = elementVar;
			this.labelVar = labelVar;
		}

		@Override
		public void handleTuple( BindingSet set, ValueFactory fac ) {
			URI element = URI.class.cast( set.getValue( elementVar ) );
			// compare by value, not by reference: two rows for the same element
			// need not share one URI instance, and an identity check would then
			// register an element with several labels more than once
			if ( !element.equals( last ) ) {
				seenLabels.clear();

				// add the URI information now; seenLabels later (avoid saving too much data)
				typeToURILkp.add( type, element );
				uriToTypeLkp.put( element, type );
				last = element;
			}

			String label = set.getValue( labelVar ).stringValue();
			// don't add multiple copies of the same label
			if ( seenLabels.add( label ) ) {
				labels.put( element, label );
			}
		}
	}

	/**
	 * A single "near" match for some element: what was matched, the label that
	 * matched, the type the match was loaded under, and the similarity score.
	 */
	public class Hit {

		private final URI match;
		private final String matchLabel;
		private final URI matchType;
		private final float score;

		/**
		 * @param match the element that was matched
		 * @param matchLabel the label of the matched element
		 * @param matchType the class the matched element was loaded under
		 * @param score the similarity between the two labels
		 */
		public Hit( URI match, String matchLabel, URI matchType, float score ) {
			this.match = match;
			this.matchLabel = matchLabel;
			this.matchType = matchType;
			this.score = score;
		}

		/**
		 * @return the element that was matched
		 */
		public URI getMatch() {
			return match;
		}

		/**
		 * @return the label of the matched element
		 */
		public String getMatchLabel() {
			return matchLabel;
		}

		/**
		 * @return the class the matched element was loaded under
		 */
		public URI getMatchType() {
			return matchType;
		}

		/**
		 * @return the similarity score of this match
		 */
		public float getScore() {
			return score;
		}

		@Override
		public String toString() {
			return "Hit (" + score + ": " + matchLabel + "," + match + " ->" + matchType + ")";
		}
	}
}