///
/// Class for modeling some compression-related properties of a data file
/// 12 Jan 2009 - D.Bozarth - Sonoma State Engineering Science
///

#region Using directives
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using Utility;
#endregion

namespace AnalyzeData
{
    /*
     * Model any file or byte stream as a histogram of byte-sized symbols
     */
    public class Model
    {
        /// <summary>Number of distinct byte values, i.e. the size of the histogram.</summary>
        public const int nSymbols = 256;

        /// <summary>Number of bytes consumed by one call to <see cref="BuildDictionary"/>.</summary>
        public const int nDictBuildSegSize = 1048576;

        /// <summary>
        /// The stream being modeled. This class never closes it; whoever
        /// constructs the model is responsible for closing the stream.
        /// </summary>
        public Stream mStream;

        /// <summary>Occurrence count of each byte value, indexed by the byte value.</summary>
        public int[] mHistogram;

        /// <summary>
        /// Phrase dictionary. Key = the phrase's byte values as comma-separated
        /// decimal text; value = (First: phrase length in bytes, Second: use count).
        /// </summary>
        public Dictionary<string, Pair<int, int>> mDictionary;

        /// <summary>
        /// Histogram over the phrase dictionary. Key = "length,useCount";
        /// value = number of dictionary entries having that length and use count.
        /// </summary>
        public Dictionary<string, int> mDictHisto;

        /// <summary>
        /// Builds a model over <paramref name="inStream"/>: the byte histogram is
        /// computed immediately from the stream's current position to its end;
        /// the dictionaries start out empty.
        /// </summary>
        /// <param name="inStream">Stream to model.</param>
        public Model( Stream inStream )
        {
            mStream = inStream;
            BuildHistogram();
            mDictionary = new Dictionary<string, Pair<int, int>>();
            mDictHisto = new Dictionary<string, int>();
        }

        /// <summary>
        /// Opens <paramref name="fileSpec"/> for reading and builds a model over it.
        /// The opened file remains available through <see cref="mStream"/>.
        /// </summary>
        /// <param name="fileSpec">Path of the file to model.</param>
        public Model( string fileSpec )
            : this( File.OpenRead( fileSpec ) )
        {
        }

        /// <summary>
        /// Parses one segment of <see cref="nDictBuildSegSize"/> bytes, starting at
        /// <paramref name="offset"/>, into <see cref="mDictionary"/>. The current
        /// phrase is extended one byte at a time; while the phrase is already in
        /// the dictionary its use count is incremented, and as soon as it is not,
        /// it is added (with use count 1) and a new empty phrase is started.
        /// </summary>
        /// <param name="offset">Stream position at which the segment starts.</param>
        /// <returns>
        /// <c>false</c>, without touching the dictionary, unless the stream holds
        /// strictly more than <c>offset + nDictBuildSegSize</c> bytes;
        /// <c>true</c> once the segment has been processed.
        /// </returns>
        public bool BuildDictionary( int offset )
        {
            // Widen to long so a large offset cannot wrap around to a negative
            // sum and slip past this guard.
            if ( mStream.Length <= (long) offset + nDictBuildSegSize )
            {
                return false;
            }

            mStream.Position = offset;
            string build = "";
            int buildSize = 0;

            for ( int i = 0; i < nDictBuildSegSize; ++i )
            {
                int aByte = mStream.ReadByte();
                build = ( build.Length == 0 )
                    ? aByte.ToString()
                    : build + "," + aByte.ToString();
                buildSize += 1;

                Pair<int, int> entry;
                if ( mDictionary.TryGetValue( build, out entry ) )
                {
                    // Known phrase: count the use and keep extending it.
                    entry.Second += 1;
                }
                else
                {
                    // New phrase: record it and start over with an empty phrase.
                    mDictionary.Add( build, new Pair<int, int>( buildSize, 1 ) );
                    build = "";
                    buildSize = 0;
                }
            }
            return true;
        } // method BuildDictionary

        /// <summary>
        /// Tallies the entries of <see cref="mDictionary"/> by (length, use count)
        /// into <see cref="mDictHisto"/>. Counts are added to whatever
        /// <see cref="mDictHisto"/> already contains.
        /// </summary>
        public void BuildDictHisto()
        {
            foreach ( KeyValuePair<string, Pair<int, int>> kvp in mDictionary )
            {
                int len = kvp.Value.First;
                int use = kvp.Value.Second;
                string histKey = len.ToString() + "," + use.ToString();

                int count;
                mDictHisto.TryGetValue( histKey, out count ); // count stays 0 when the key is absent
                mDictHisto[histKey] = count + 1;
            }
        } // method BuildDictHisto

        /// <summary>
        /// Resets <see cref="mHistogram"/> and counts every byte from the stream's
        /// current position to its end.
        /// </summary>
        public void BuildHistogram()
        {
            mHistogram = new int[ nSymbols ];

            // ReadByte returns -1 at end of stream; stopping on that (rather than
            // comparing Position to Length) can never index the histogram with -1.
            int next;
            while ( ( next = mStream.ReadByte() ) >= 0 )
            {
                mHistogram[next] += 1;
            }
        }

        /// <summary>
        /// Computes the entropy of the byte histogram in bits per symbol:
        /// the sum over all symbols of -p * log2(p).
        /// </summary>
        /// <returns>
        /// The entropy, in the range 0 to log2(nSymbols); 0 when the histogram is empty.
        /// </returns>
        public double Entropy()
        {
            double sumFreq = 0;
            for ( int i = 0; i < nSymbols; ++i )
            {
                sumFreq += mHistogram[i];
            }
            if ( sumFreq == 0 )
            {
                return 0; // no data: avoid 0/0
            }

            double entropy = 0;
            for ( int i = 0; i < nSymbols; ++i )
            {
                // A symbol that never occurs contributes nothing. Evaluating
                // 0 * log2(0) would give 0 * -Infinity = NaN and poison the sum.
                if ( mHistogram[i] == 0 )
                {
                    continue;
                }
                double p = mHistogram[i] / sumFreq;
                entropy -= p * Math.Log( p, 2 );
            }
            return entropy;
        } // method Entropy

        /// <summary>
        /// Returns log2(nSymbols) minus <see cref="Entropy"/>, in bits per symbol.
        /// </summary>
        public double Redundancy()
        {
            return Math.Log( nSymbols, 2 ) - Entropy();
        }

        /// <summary>
        /// Writes <see cref="mDictHisto"/> as CSV to the file <paramref name="fileSpec"/>,
        /// replacing any existing file.
        /// </summary>
        /// <param name="fileSpec">Path of the output file.</param>
        public void DictToCsv( string fileSpec )
        {
            File.Delete( fileSpec );
            // Dispose the stream so the data is flushed and the handle released.
            using ( Stream stream = File.OpenWrite( fileSpec ) )
            {
                DictToCsv( stream );
            }
        }

        /// <summary>
        /// Writes <see cref="mDictHisto"/> to <paramref name="outStream"/> as three
        /// comma-separated rows joined by <see cref="Environment.NewLine"/>:
        /// phrase lengths, use counts, and the number of phrases for each
        /// (length, use count) pair. No newline follows the last row.
        /// </summary>
        /// <param name="outStream">Destination stream; it is left open.</param>
        public void DictToCsv( Stream outStream )
        {
            StringBuilder rowX = new StringBuilder();
            StringBuilder rowY = new StringBuilder();
            StringBuilder rowZ = new StringBuilder();

            bool first = true;
            foreach ( KeyValuePair<string, int> kvp in mDictHisto )
            {
                if ( !first )
                {
                    rowX.Append( ',' );
                    rowY.Append( ',' );
                    rowZ.Append( ',' );
                }
                first = false;

                // Keys have the form "length,useCount"; split at the comma.
                string key = kvp.Key;
                int comma = key.IndexOf( ',' );
                rowX.Append( key.Substring( 0, comma ) );
                rowY.Append( key.Substring( comma + 1 ) );
                rowZ.Append( kvp.Value.ToString() );
            }

            string newline = Environment.NewLine;
            WriteAscii( outStream,
                        rowX.ToString() + newline + rowY.ToString() + newline + rowZ.ToString() );
        } // method DictToCsv

        /// <summary>
        /// Writes <see cref="mHistogram"/> as CSV to the file <paramref name="fileSpec"/>,
        /// replacing any existing file.
        /// </summary>
        /// <param name="fileSpec">Path of the output file.</param>
        public void HistToCsv( string fileSpec )
        {
            File.Delete( fileSpec );
            // Dispose the stream so the data is flushed and the handle released.
            using ( Stream stream = File.OpenWrite( fileSpec ) )
            {
                HistToCsv( stream );
            }
        }

        /// <summary>
        /// Writes the <see cref="nSymbols"/> histogram counts to
        /// <paramref name="outStream"/> as a single comma-separated row with no
        /// trailing newline.
        /// </summary>
        /// <param name="outStream">Destination stream; it is left open.</param>
        public void HistToCsv( Stream outStream )
        {
            StringBuilder row = new StringBuilder();
            for ( int j = 0; j < nSymbols; ++j )
            {
                if ( j > 0 )
                {
                    row.Append( ',' );
                }
                row.Append( mHistogram[j].ToString() );
            }
            WriteAscii( outStream, row.ToString() );
        }

        // Writes text (digits, commas and newlines only) to the stream as one byte per character.
        private static void WriteAscii( Stream outStream, string text )
        {
            byte[] bytes = Encoding.ASCII.GetBytes( text );
            outStream.Write( bytes, 0, bytes.Length );
        }
    } // class Model
} // namespace AnalyzeData