///
/// Class for modeling some compression-related properties of a data file
/// 12 Jan 2009 - D.Bozarth - Sonoma State Engineering Science
///
#region Using directives
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using Utility;
#endregion
namespace AnalyzeData
{
/*
* Model any file or byte stream as a histogram of byte-sized symbols
*/
/// <summary>
/// Models a byte stream as a histogram of byte values and, on request, as a
/// phrase dictionary built by incrementally parsing fixed-size segments.
/// The stream must support <c>Length</c> and <c>Position</c> for
/// <see cref="BuildDictionary"/> to work.
/// </summary>
public class Model : IDisposable
{
    /// <summary>Number of distinct byte-sized symbols, i.e. histogram bins.</summary>
    public const int nSymbols = 256;

    /// <summary>Number of bytes consumed by one call to <see cref="BuildDictionary"/> (2^20).</summary>
    public const int nDictBuildSegSize = 1048576;

    /// <summary>The stream being modeled.</summary>
    public Stream mStream;

    /// <summary>Occurrence count of each byte value, indexed by the byte value.</summary>
    public int[] mHistogram;

    // NOTE(review): the generic arguments below are reconstructed from how the
    // members are used in this class (string keys; values exposing int First and
    // int Second, with Second mutable in place). Pair is assumed to be the
    // reference type Utility.Pair<T1, T2> - confirm against the Utility project.

    /// <summary>
    /// Phrase dictionary: key = comma-separated decimal byte values of a phrase,
    /// value = (First: phrase length in bytes, Second: times the phrase was seen).
    /// </summary>
    public Dictionary<string, Pair<int, int>> mDictionary;

    /// <summary>
    /// Summary of <see cref="mDictionary"/>: key = "length,count",
    /// value = number of dictionary entries having that length and count.
    /// </summary>
    public Dictionary<string, int> mDictHisto;

    // True only when this instance opened mStream itself and must close it.
    private readonly bool mOwnsStream;

    /// <summary>
    /// Models an existing stream, reading it from its current position to the
    /// end to build the byte histogram. The caller keeps ownership of the stream.
    /// </summary>
    /// <param name="inStream">Stream to model.</param>
    /// <exception cref="ArgumentNullException"><paramref name="inStream"/> is null.</exception>
    public Model( Stream inStream ) {
        if ( inStream == null ) {
            throw new ArgumentNullException( "inStream" );
        }
        mStream = inStream;
        Initialize();
    }

    /// <summary>
    /// Opens the named file for reading and models its contents. The file stays
    /// open (later dictionary builds read from it) until <see cref="Dispose"/>.
    /// </summary>
    /// <param name="fileSpec">Path of the file to model.</param>
    public Model( string fileSpec ) {
        mStream = File.OpenRead( fileSpec );
        mOwnsStream = true;
        try {
            Initialize();
        } catch {
            // Do not leak the file handle when the initial scan fails.
            mStream.Dispose();
            throw;
        }
    }

    // Shared constructor tail: scan the stream, then create the empty dictionaries.
    private void Initialize() {
        BuildHistogram();
        mDictionary = new Dictionary<string, Pair<int, int>>();
        mDictHisto = new Dictionary<string, int>();
    }

    /// <summary>
    /// Closes the underlying stream if it was opened by the
    /// <see cref="Model(string)"/> constructor; a caller-supplied stream is left open.
    /// </summary>
    public void Dispose() {
        if ( mOwnsStream && mStream != null ) {
            mStream.Dispose();
        }
    }

    /// <summary>
    /// Parses one segment of <see cref="nDictBuildSegSize"/> bytes starting at
    /// <paramref name="offset"/>, extending the current phrase one byte at a time.
    /// A phrase already in the dictionary has its use count incremented and keeps
    /// growing; a new phrase is added with count 1 and the phrase is reset.
    /// Results accumulate in <see cref="mDictionary"/> across calls.
    /// </summary>
    /// <param name="offset">Stream position at which the segment starts.</param>
    /// <returns>
    /// false when the stream does not extend beyond the end of the segment (or
    /// ends unexpectedly while being read); true when the whole segment was parsed.
    /// </returns>
    public bool BuildDictionary( int offset ) {
        // Widen to 64 bits so a large offset cannot wrap around and defeat the check.
        long segmentEnd = (long) offset + nDictBuildSegSize;
        if ( mStream.Length <= segmentEnd ) {
            return false;
        }
        mStream.Position = offset;
        string phrase = "";
        int phraseSize = 0;
        for ( int i = 0; i < nDictBuildSegSize; ++i ) {
            int aByte = mStream.ReadByte();
            if ( aByte < 0 ) {
                // The stream ended early despite the length check; stop rather
                // than record "-1" as a symbol. Entries added so far are kept.
                return false;
            }
            if ( 0 < phrase.Length ) {
                phrase += ",";
            }
            phrase += aByte.ToString();
            phraseSize += 1;

            Pair<int, int> entry;
            if ( mDictionary.TryGetValue( phrase, out entry ) ) {
                entry.Second += 1;
            } else {
                mDictionary.Add( phrase, new Pair<int, int>( phraseSize, 1 ) );
                phrase = "";
                phraseSize = 0;
            }
        }
        return true;
    } // method BuildDictionary

    /// <summary>
    /// Rebuilds <see cref="mDictHisto"/> from the current <see cref="mDictionary"/>,
    /// counting how many phrases share each (length, use count) combination.
    /// Any previous tally is discarded first so repeated calls do not double-count.
    /// </summary>
    public void BuildDictHisto() {
        mDictHisto.Clear();
        foreach ( KeyValuePair<string, Pair<int, int>> kvp in mDictionary ) {
            int len = kvp.Value.First;
            int use = kvp.Value.Second;
            string histKey = len.ToString() + "," + use.ToString();
            int seen;
            // TryGetValue leaves 'seen' at 0 when the key is not present yet.
            mDictHisto.TryGetValue( histKey, out seen );
            mDictHisto[histKey] = seen + 1;
        }
    } // method BuildDictHisto

    /// <summary>
    /// Replaces <see cref="mHistogram"/> with the counts of every byte read from
    /// the stream's current position to its end. The stream is not rewound, so a
    /// call made when the stream is already at its end yields an all-zero histogram.
    /// </summary>
    public void BuildHistogram() {
        mHistogram = new int[ nSymbols ];
        byte[] buffer = new byte[ 65536 ];
        int read;
        // Read in chunks; Read returning 0 marks the end of the stream.
        while ( ( read = mStream.Read( buffer, 0, buffer.Length ) ) > 0 ) {
            for ( int i = 0; i < read; ++i ) {
                mHistogram[ buffer[i] ] += 1;
            }
        }
    }

    /// <summary>
    /// Computes the entropy of the byte histogram in bits per symbol.
    /// </summary>
    /// <returns>
    /// A value between 0 and log2(<see cref="nSymbols"/>); 0 for an empty histogram.
    /// </returns>
    public double Entropy() {
        double sumFreq = 0;
        for ( int i = 0; i < nSymbols; ++i ) {
            sumFreq += mHistogram[i];
        }
        if ( sumFreq <= 0 ) {
            return 0;   // nothing counted: avoid 0/0
        }
        double entropy = 0;
        for ( int i = 0; i < nSymbols; ++i ) {
            if ( mHistogram[i] <= 0 ) {
                // An absent symbol contributes nothing; evaluating 0 * log(0)
                // would give NaN and poison the whole sum.
                continue;
            }
            double p = mHistogram[i] / sumFreq;
            entropy -= p * Math.Log( p, 2 );
        }
        return entropy;
    } // method Entropy

    /// <summary>
    /// Returns the difference between the maximum possible entropy,
    /// log2(<see cref="nSymbols"/>), and the measured <see cref="Entropy"/>.
    /// </summary>
    public double Redundancy() {
        return Math.Log( nSymbols, 2 ) - Entropy();
    }

    /// <summary>
    /// Writes the dictionary histogram as CSV to a file, replacing any existing file.
    /// </summary>
    /// <param name="fileSpec">Path of the file to write.</param>
    public void DictToCsv( string fileSpec ) {
        File.Delete( fileSpec );
        // 'using' closes the file so the data is flushed and the handle released.
        using ( Stream stream = File.OpenWrite( fileSpec ) ) {
            DictToCsv( stream );
        }
    }

    /// <summary>
    /// Writes <see cref="mDictHisto"/> to a stream as three comma-separated rows:
    /// the part of each key before its comma, the part after it, and the values.
    /// Rows are separated by <see cref="Environment.NewLine"/>; the last row has
    /// no line terminator. Every key is expected to contain a comma.
    /// </summary>
    /// <param name="outStream">Destination stream; it is left open.</param>
    public void DictToCsv( Stream outStream ) {
        int count = mDictHisto.Count;
        string[] rowX = new string[ count ];
        string[] rowY = new string[ count ];
        string[] rowZ = new string[ count ];
        int ndx = 0;
        // A single pass fills all three rows so their columns stay aligned.
        foreach ( KeyValuePair<string, int> kvp in mDictHisto ) {
            string key = kvp.Key;
            int comma = key.IndexOf( ',' );
            rowX[ndx] = key.Substring( 0, comma );
            rowY[ndx] = key.Substring( comma + 1 );
            rowZ[ndx] = kvp.Value.ToString();
            ndx += 1;
        }
        string text = string.Join( ",", rowX ) + Environment.NewLine
                    + string.Join( ",", rowY ) + Environment.NewLine
                    + string.Join( ",", rowZ );
        byte[] bytes = Encoding.ASCII.GetBytes( text );
        outStream.Write( bytes, 0, bytes.Length );
    } // method DictToCsv

    /// <summary>
    /// Writes the byte histogram as CSV to a file, replacing any existing file.
    /// </summary>
    /// <param name="fileSpec">Path of the file to write.</param>
    public void HistToCsv( string fileSpec ) {
        File.Delete( fileSpec );
        // 'using' closes the file so the data is flushed and the handle released.
        using ( Stream stream = File.OpenWrite( fileSpec ) ) {
            HistToCsv( stream );
        }
    }

    /// <summary>
    /// Writes the <see cref="nSymbols"/> histogram counts to a stream as a single
    /// comma-separated row with no line terminator.
    /// </summary>
    /// <param name="outStream">Destination stream; it is left open.</param>
    public void HistToCsv( Stream outStream ) {
        string[] cells = new string[ nSymbols ];
        for ( int j = 0; j < nSymbols; ++j ) {
            cells[j] = mHistogram[j].ToString();
        }
        byte[] bytes = Encoding.ASCII.GetBytes( string.Join( ",", cells ) );
        outStream.Write( bytes, 0, bytes.Length );
    }
} // class Model
} // namespace AnalyzeData