LowMemXlsReader.java

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package com.ostrichemulators.semtool.poi.main;

import com.ostrichemulators.semtool.poi.main.xlsxml.LoadingSheetXmlHandler;
import com.ostrichemulators.semtool.poi.main.xlsxml.SheetTypeXmlHandler;
import com.ostrichemulators.semtool.poi.main.xlsxml.LoaderTabXmlHandler;
import com.ostrichemulators.semtool.poi.main.xlsxml.MetadataTabXmlHandler;
import com.ostrichemulators.semtool.poi.main.ImportValidationException.ErrorType;
import com.ostrichemulators.semtool.poi.main.xlsxml.XlsXmlBase;
import com.ostrichemulators.semtool.util.MultiMap;
import com.ostrichemulators.semtool.util.Utility;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.log4j.Logger;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.model.StylesTable;
import org.dom4j.Document;
import org.dom4j.Element;
import org.dom4j.Namespace;
import org.dom4j.QName;
import org.dom4j.io.SAXReader;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.XMLReaderFactory;

/**
 * A class to read an xlsx file and produce an ImportData instance.
 * <p>
 * The workbook is processed with SAX handlers, one sheet at a time. The
 * sheet name-to-id lookup and the shared strings are read once, in the
 * constructor, and are kept until {@link #release()} is called.
 *
 * @author ryan
 */
public class LowMemXlsReader {

	private static final Logger log = Logger.getLogger( LowMemXlsReader.class );

	/**
	 * Parser feature that rejects any DOCTYPE declaration. Workbook XML never
	 * needs one, and rejecting it prevents external-entity tricks in crafted
	 * files.
	 */
	private static final String DISALLOW_DOCTYPE
			= "http://apache.org/xml/features/disallow-doctype-decl";

	private final LinkedHashMap<String, String> sheetNameIdLkp;
	private final List<String> sharedStrings;
	private final XSSFReader reader;
	private final OPCPackage pkg;
	private final StylesTable styles;
	private boolean lsInMem = false;

	/**
	 * Creates a reader for the given workbook file.
	 *
	 * @param filename the xlsx file to read
	 * @throws IOException if the file cannot be opened or is not a usable
	 * workbook
	 */
	public LowMemXlsReader( File filename ) throws IOException {
		this( new BufferedInputStream( new FileInputStream( filename ) ) );
	}

	/**
	 * Creates a reader for the workbook at the given path.
	 *
	 * @param filename path of the xlsx file to read
	 * @throws IOException if the file cannot be opened or is not a usable
	 * workbook
	 */
	public LowMemXlsReader( String filename ) throws IOException {
		this( new File( filename ) );
	}

	/**
	 * Creates a reader for the workbook contained in the given stream. The sheet
	 * lookup and shared strings are loaded immediately.
	 *
	 * @param stream the xlsx content
	 * @throws IOException if the package cannot be opened; OpenXML4J failures
	 * are wrapped, with the original exception as the cause
	 */
	public LowMemXlsReader( InputStream stream ) throws IOException {
		log.debug( "reading with lo-mem xls reader" );
		sharedStrings = new ArrayList<>();

		// tracked separately from the final field so the finally block can
		// release the package when initialization fails part-way through
		OPCPackage opened = null;
		boolean ready = false;
		try {
			opened = OPCPackage.open( stream );
			pkg = opened;
			reader = new XSSFReader( pkg );

			styles = reader.getStylesTable();

			sheetNameIdLkp = readSheetInfo( reader );
			populateSharedStrings( reader );
			ready = true;
		}
		catch ( OpenXML4JException e ) {
			throw new IOException( "unexpected error: " + e.getLocalizedMessage(), e );
		}
		finally {
			if ( !ready && null != opened ) {
				try {
					// nothing was modified, so discard the package without saving
					opened.revert();
				}
				catch ( RuntimeException re ) {
					// don't let cleanup problems mask the original failure
					log.warn( re, re );
				}
			}
		}
	}

	/**
	 * Sets the flag that is handed to every {@link LoadingSheetXmlHandler}
	 * created by {@link #getData()}.
	 *
	 * @param b the new flag value
	 */
	public void keepSheetDataInMemory( boolean b ) {
		lsInMem = b;
	}

	/**
	 * Releases resources used by this reader. Problems closing the package are
	 * logged, not thrown; the cached lookups are cleared either way.
	 */
	public void release() {
		try {
			pkg.close();
		}
		catch ( Exception e ) {
			log.error( e, e );
		}
		finally {
			sheetNameIdLkp.clear();
			sharedStrings.clear();
		}
	}

	/**
	 * Gets the sheet types. If a loader tab exists, only those tabs will be
	 * checked (and the metadata tab will be verified against what the loader tab
	 * says). Without a loader tab, every sheet in the workbook is checked.
	 * <p>
	 * Parsing and I/O problems are logged and end the scan early; whatever was
	 * determined up to that point is still returned.
	 *
	 * @return a mapping of sheet name to the type reported by the sheet itself
	 * @throws ImportValidationException if the loader tab lists no sheets, lists
	 * a sheet that does not exist, disagrees with a sheet about it being a
	 * metadata sheet, or if more than one metadata sheet is found
	 */
	public Map<String, SheetType> getSheetTypes() throws ImportValidationException {
		Map<String, SheetType> types = new HashMap<>();
		Set<String> tabsToCheck = new HashSet<>();
		boolean checktypes = false;

		try {
			XMLReader parser = newParser();

			if ( sheetNameIdLkp.containsKey( "Loader" ) ) {
				checktypes = true;
				try ( InputStream is = reader.getSheet( sheetNameIdLkp.get( "Loader" ) ) ) {

					LoaderTabXmlHandler handler = new LoaderTabXmlHandler( sharedStrings );
					parser.setContentHandler( handler );

					InputSource sheetSource = new InputSource( is );
					parser.parse( sheetSource );

					types.putAll( handler.getSheetTypes() );
					tabsToCheck.addAll( types.keySet() );

					if ( tabsToCheck.isEmpty() ) {
						throw new ImportValidationException( ErrorType.MISSING_DATA,
								"No data to process" );
					}
				}
			}
			else {
				tabsToCheck.addAll( sheetNameIdLkp.keySet() );
			}

			// now check the actual sheets
			SheetTypeXmlHandler handler = new SheetTypeXmlHandler( sharedStrings );
			parser.setContentHandler( handler );

			boolean seenMetadata = false; // we can only have 1 metadata tab
			for ( String sheetname : tabsToCheck ) {
				if ( !sheetNameIdLkp.containsKey( sheetname ) ) {
					throw new ImportValidationException( ErrorType.MISSING_DATA,
							"Missing sheet: " + sheetname );
				}

				try ( InputStream is = reader.getSheet( sheetNameIdLkp.get( sheetname ) ) ) {
					InputSource sheetSource = new InputSource( is );
					parser.parse( sheetSource );

					SheetType sheettype = handler.getSheetType();
					boolean sheetsaysM = ( SheetType.METADATA == sheettype );

					if ( sheetsaysM ) {
						if ( seenMetadata ) {
							throw new ImportValidationException( ErrorType.TOO_MUCH_DATA,
									"Too many metadata tabs in loading file" );
						}
						seenMetadata = true;
					}

					if ( checktypes ) {
						SheetType loadertype = types.get( sheetname );
						// if the loader or the sheet itself says its a metadata sheet,
						// then both types must agree
						boolean loaderOnlyUsual
								= ( SheetType.USUAL == loadertype && sheetsaysM );
						boolean loaderOnlyMetadata
								= ( SheetType.METADATA == loadertype && !sheetsaysM );
						if ( loaderOnlyUsual || loaderOnlyMetadata ) {
							throw new ImportValidationException( ErrorType.WRONG_TABTYPE,
									"Loader Sheet data type for " + sheetname
									+ " conflicts with sheet type" );
						}
					}

					// the type found in the sheet replaces whatever the loader tab said
					types.put( sheetname, sheettype );
				}
			}
		}
		catch ( SAXException | IOException | InvalidFormatException e ) {
			log.error( e, e );
		}

		return types;
	}

	/**
	 * Reads only the metadata of the workbook. Namespaces start out as
	 * {@link Utility#DEFAULTNAMESPACES} and are then handed to the handler of
	 * each metadata sheet. Parsing and I/O problems are logged, not thrown.
	 *
	 * @return the metadata, as left by the metadata sheet handler(s)
	 * @throws ImportValidationException if {@link #getSheetTypes()} rejects the
	 * workbook
	 */
	public ImportMetadata getMetadata() throws ImportValidationException {
		ImportData id = new ImportData();
		id.getMetadata().setNamespaces( Utility.DEFAULTNAMESPACES );

		try {
			XMLReader parser = newParser();

			Map<String, SheetType> types = getSheetTypes();
			MultiMap<SheetType, String> mm = MultiMap.flip( types );

			// load the metadata sheet first, if we have one
			for ( String metasheet : mm.getNN( SheetType.METADATA ) ) {
				loadMetadataSheet( parser, metasheet, id );
			}
		}
		catch ( SAXException | InvalidFormatException | IOException ife ) {
			log.error( ife, ife );
		}

		return id.getMetadata();
	}

	/**
	 * Reads the whole workbook: the metadata sheet first (if any), then every
	 * sheet typed as NODE or RELATION. Sheets of any other type are skipped.
	 * Parsing and I/O problems are logged and end the load early; whatever was
	 * loaded up to that point is kept.
	 *
	 * @return the loaded data; never empty
	 * @throws ImportValidationException if {@link #getSheetTypes()} rejects the
	 * workbook, if a node/relation sheet contains no loadable data, or if
	 * nothing at all could be loaded. Data loaded before the failure is
	 * released.
	 */
	public ImportData getData() throws ImportValidationException {
		ImportData id = new ImportData();
		id.getMetadata().setNamespaces( Utility.DEFAULTNAMESPACES );

		try {
			XMLReader parser = newParser();

			Map<String, SheetType> types = getSheetTypes();
			MultiMap<SheetType, String> mm = MultiMap.flip( types );

			// load the metadata sheet first, if we have one
			for ( String metasheet : mm.getNN( SheetType.METADATA ) ) {
				loadMetadataSheet( parser, metasheet, id );
				types.remove( metasheet ); // don't reprocess in the next loop
			}

			for ( Map.Entry<String, SheetType> typeen : types.entrySet() ) {
				SheetType sheettype = typeen.getValue();
				if ( SheetType.NODE != sheettype && SheetType.RELATION != sheettype ) {
					// nothing to load, so don't bother opening the sheet
					continue;
				}

				String sheetname = typeen.getKey();
				String sheetid = sheetNameIdLkp.get( sheetname );

				try ( InputStream is = reader.getSheet( sheetid ) ) {
					LoadingSheetXmlHandler handler
							= new LoadingSheetXmlHandler( sharedStrings, styles, sheetname,
									id.getMetadata().getNamespaces(), lsInMem );
					parser.setContentHandler( handler );

					InputSource sheetSource = new InputSource( is );
					parser.parse( sheetSource );

					LoadingSheetData lsd = handler.getSheet();
					if ( lsd.isEmpty() ) {
						lsd.release();
						throw new ImportValidationException( ErrorType.NOT_A_LOADING_SHEET,
								"Sheet " + sheetname + " contains no loadable data" );
					}
					id.add( lsd );
				}
			}
		}
		catch ( ImportValidationException ive ) {
			// the caller never sees this instance, so free what it already holds
			id.release();
			throw ive;
		}
		catch ( SAXException | InvalidFormatException | IOException ife ) {
			log.error( ife, ife );
		}

		if ( id.isEmpty() ) {
			id.release();
			throw new ImportValidationException( ErrorType.MISSING_DATA,
					"No data to process" );
		}

		return id;
	}

	/**
	 * Gets the names of the sheets in the workbook, in the order they were read
	 * from the workbook part.
	 *
	 * @return a copy of the sheet names; changing it does not affect this reader
	 */
	public Collection<String> getSheetNames() {
		return new LinkedHashSet<>( sheetNameIdLkp.keySet() );
	}

	/**
	 * Creates a SAX parser that refuses DOCTYPE declarations. If the underlying
	 * parser does not know that feature, a warning is logged and the parser is
	 * returned as-is.
	 *
	 * @return a new parser
	 * @throws SAXException if no parser can be created
	 */
	private static XMLReader newParser() throws SAXException {
		XMLReader parser = XMLReaderFactory.createXMLReader();
		try {
			parser.setFeature( DISALLOW_DOCTYPE, true );
		}
		catch ( SAXException unsupported ) {
			log.warn( "XML parser does not support " + DISALLOW_DOCTYPE, unsupported );
		}
		return parser;
	}

	/**
	 * Parses one metadata sheet and stores the handler's resulting metadata in
	 * the given ImportData.
	 *
	 * @param parser the parser to use; its content handler is replaced
	 * @param metasheet the name of the metadata sheet
	 * @param id the data whose metadata seeds the handler, and receives its
	 * result
	 * @throws SAXException if the sheet cannot be parsed
	 * @throws IOException if the sheet cannot be read
	 * @throws InvalidFormatException if the sheet cannot be opened
	 * @throws ImportValidationException declared so a handler may report
	 * invalid content
	 */
	private void loadMetadataSheet( XMLReader parser, String metasheet, ImportData id )
			throws SAXException, IOException, InvalidFormatException,
			ImportValidationException {
		try ( InputStream is = reader.getSheet( sheetNameIdLkp.get( metasheet ) ) ) {
			MetadataTabXmlHandler handler
					= new MetadataTabXmlHandler( sharedStrings, id.getMetadata() );
			parser.setContentHandler( handler );

			InputSource sheetSource = new InputSource( is );
			parser.parse( sheetSource );

			id.setMetadata( handler.getMetadata() );
		}
	}

	/**
	 * Gets sheet name-to-id mapping, where the id is the value of each sheet's
	 * relationship-namespace "id" attribute in the workbook part. Any failure is
	 * logged, and the entries collected so far (possibly none) are returned.
	 *
	 * @param r the reader to get the workbook part from
	 * @return the mapping, in document order
	 */
	private LinkedHashMap<String, String> readSheetInfo( XSSFReader r ) {
		LinkedHashMap<String, String> map = new LinkedHashMap<>();

		try ( InputStream is = r.getWorkbookData() ) {
			SAXReader sax = new SAXReader();
			try {
				sax.setFeature( DISALLOW_DOCTYPE, true );
			}
			catch ( SAXException unsupported ) {
				log.warn( "XML parser does not support " + DISALLOW_DOCTYPE, unsupported );
			}
			Document doc = sax.read( is );

			Namespace ns = new Namespace( "r",
					"http://schemas.openxmlformats.org/officeDocument/2006/relationships" );

			Element sheets = doc.getRootElement().element( "sheets" );
			if ( null == sheets ) {
				log.error( "workbook contains no sheets element" );
				return map;
			}

			for ( Object sheet : sheets.elements( "sheet" ) ) {
				Element e = Element.class.cast( sheet );
				String name = e.attributeValue( "name" );
				String id = e.attributeValue( new QName( "id", ns ) );
				map.put( name, id );
			}
		}
		catch ( Exception e ) {
			log.error( e, e );
		}

		return map;
	}

	/**
	 * Fills the shared string cache. Exactly one entry is added per shared
	 * string item ("si" element), because cells refer to shared strings by item
	 * index: the text ("t") elements of all rich-text runs of an item are
	 * concatenated, text inside phonetic runs ("rPh") is ignored, and an item
	 * without any text yields an empty string. Any failure is logged, leaving
	 * the cache with whatever was read up to that point.
	 *
	 * @param r the reader to get the shared strings part from
	 */
	private void populateSharedStrings( XSSFReader r ) {

		try ( InputStream is = r.getSharedStringsData() ) {
			if ( null == is ) {
				// presumably a workbook without a shared strings part; nothing to cache
				log.debug( "no shared strings to cache" );
				return;
			}

			XMLReader parser = newParser();
			ContentHandler handler = new XlsXmlBase( new ArrayList<>() ) {
				private final StringBuilder item = new StringBuilder();
				private boolean inPhonetic = false;
				private int count = 0;

				@Override
				public void startDocument() throws SAXException {
					super.startDocument();
					count = 0;
				}

				@Override
				public void endDocument() throws SAXException {
					super.endDocument();
					log.debug( count + " strings cached" );
				}

				@Override
				public void startElement( String uri, String localName, String qName,
						Attributes atts ) throws SAXException {
					if ( "si".equals( localName ) ) {
						// a new item starts: forget the previous item's text
						item.setLength( 0 );
						inPhonetic = false;
					}
					else if ( "rPh".equals( localName ) ) {
						inPhonetic = true;
					}

					setReading( "t".equals( localName ) && !inPhonetic );
					resetContents();
				}

				@Override
				public void endElement( String uri, String localName, String qName )
						throws SAXException {
					if ( isReading() ) {
						// end of a text element: its text is one piece of the item
						item.append( getContents() );
						setReading( false );
					}

					if ( "rPh".equals( localName ) ) {
						inPhonetic = false;
					}
					else if ( "si".equals( localName ) ) {
						sharedStrings.add( item.toString() );
						count++;
					}
				}
			};
			parser.setContentHandler( handler );
			parser.parse( new InputSource( is ) );
		}
		catch ( Exception e ) {
			log.error( e, e );
		}
	}
}