// DeterministicSanitizer.java

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package com.ostrichemulators.semtool.util;

import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.codec.digest.DigestUtils;

/**
 * A Sanitizer that creates valid URIs in a deterministic fashion.
 * <p>
 * Input that already passes {@link RDFDatatypeTools#isValidUriChars} is
 * returned untouched. Otherwise a simple space-to-underscore replacement is
 * tried, and if that is still not valid, the MD5 hex digest of the raw input
 * (prefixed with a computed letter A-Z) is returned instead.
 *
 * @author ryan
 */
public class DeterministicSanitizer implements UriSanitizer {

	/**
	 * Matches a single lowercase letter. Compiled once and shared, since a
	 * compiled Pattern is immutable; each call creates its own Matcher.
	 */
	private static final Pattern PAT = Pattern.compile( "([a-z])" );

	/** Starting value of the sum used to pick the leading character. */
	private static final int SEED = 17;

	/** Number of candidate leading characters (A-Z). */
	private static final int ALPHABET_SIZE = 26;

	/**
	 * Converts a raw string to a string containing only valid URI characters.
	 * A null argument is not handled explicitly by this method.
	 *
	 * @param raw the string to sanitize
	 * @return <code>raw</code> itself if it is already valid; else the trimmed
	 * string with spaces replaced by underscores if that is valid; else the MD5
	 * hex digest of <code>raw</code> (untrimmed) prefixed with a letter A-Z
	 */
	@Override
	public String sanitize( String raw ) {
		if ( RDFDatatypeTools.isValidUriChars( raw ) ) {
			return raw;
		}

		// Attempt a simple sanitizing: trim, then turn spaces into underscores
		// (plain character replacement; no regex is needed for this)
		String rawWithUnderscores = raw.trim().replace( ' ', '_' );
		if ( RDFDatatypeTools.isValidUriChars( rawWithUnderscores ) ) {
			return rawWithUnderscores;
		}

		// Fall back to a digest of the original (untrimmed) input
		String md5 = DigestUtils.md5Hex( raw );
		return leadingLetter( md5 ) + md5;
	}

	/**
	 * Deterministically chooses a reasonably-random leading character for a
	 * digest, because the digest itself might start with a number. We add up
	 * the end offsets of the lowercase letters in the digest, then map that sum
	 * onto A-Z.
	 *
	 * @param md5 the hex digest to derive the character from
	 * @return an uppercase ASCII letter A-Z
	 */
	private static char leadingLetter( String md5 ) {
		final int limit = md5.length() / 2;
		int counter = SEED;

		Matcher m = PAT.matcher( md5 );
		while ( m.find() ) {
			int end = m.end( 1 );
			if ( end > limit ) {
				// don't want letters at the end to be too significant, so shrink them
				end -= limit;
			}

			counter += end;
		}

		return (char) ( 'A' + ( counter % ALPHABET_SIZE ) );
	}
}