' M \ K Compressor  http://members.aol.com/mkwebsite
' This program uses the Huffman algorithm to encode files.
'
' This program was written directly from a section in a Computer Systems
' published in 1992 without any mention of optimizations, so there
' are very few implemented.
'
' There are several recursive procedures in the encoding program,
' but they aren't used that extensively so it's not worth changing them.
' As far as speed goes, if you are going bit by bit with QBasic your
' program is not going to be that fast.  Still, with a 733 MHZ computer,
' the program encodes around 25 kilobytes of data per second and
' decodes around 32 kilobytes of data per second.
'
' Here is a summary of how the Huffman algorithm works:  In a long sequence
' of bits (a text file, for example), there will be more frequent characters
' and there will be less frequent characters.  So you count the frequencies
' of the characters and set up a "forest" of trees, each representing one
' 8 bit pattern and its respective frequency.  Then you continuously merge the
' two least-frequent trees until only one tree remains, and you have your
' Huffman tree of codes.  In the tree, every time you go descend to a left
' subtree, you add a 1, and everytime you descend to a right subtree you add a
' 0.  (You accumulate these bits)  What should happen is the more frequent
' characters will be assigned shorted binary codes (obviously less than 8 bits
' in length in order to be effective).
'
' The output files are, for the lack of a better name, called "MKZ"s.  The
' header contains some important information and is followed by the 256
' byte frequency table, followed by the bit stream that needs to be decoded.
' Decoding is done with MKDECODE.BAS, which contains a lot less code in
' an effort to be more reusable.  Use freely.


DECLARE FUNCTION StrippedName$ (in$)
DECLARE SUB WriteFreqTable ()
DECLARE SUB WriteFileData ()
DECLARE SUB BuildFreqTable ()
DECLARE SUB DisplayMoreInfo ()
DECLARE SUB DisplayInfo ()
DECLARE SUB EncodeFile ()
DECLARE SUB BuildCodeTable ()
DECLARE SUB RemoveAnyCode (CurRoot%, binary$)
DECLARE SUB BuildCodeTree ()
DECLARE SUB WriteHeader ()
DECLARE SUB WriteCodeTree ()
DECLARE SUB WriteEncodedByte ()
DECLARE FUNCTION EnabledTrees% ()
DECLARE FUNCTION FirstFreeTree% ()
DECLARE FUNCTION FindSmallestTree% ()



'*********** Begin Data Initialization *************

' Program Constants
CONST NoSubtrees = 0
CONST ChunkSize = 600

' Node type for trees.  Only the Code and Ptr fields are needed to decode
TYPE NodeType
	Code AS STRING * 1    ' The actual code
	Freq AS DOUBLE       ' this is needed for building tree - not decoding!
	Lptr AS INTEGER       ' pointer to left subtree
	Rptr AS INTEGER       ' pointer to right subtree
	GenBool AS STRING * 1 ' "boolean" for "set emulation" purposes
END TYPE

' Type for tracking byte frequencies / building code tree
TYPE TableType
	Count AS DOUBLE      ' Number of times byte appears in file
	GenBool AS STRING * 1 ' "Boolean" value
	Freq AS INTEGER      ' Rounder off frequency (1 byte)
END TYPE

' Type for the output file header
TYPE FileHeaderType
	MKZ AS STRING * 3
	Reserved AS STRING * 1
	Version AS SINGLE
	ExpectedFileName AS STRING * 12
	ExpectedFileLen AS DOUBLE
END TYPE

' Create a fileheader object
DIM SHARED File AS FileHeaderType

' The maximum size of the code tree for 8 bit patterns is 512, not all nodes
' neccesarily are used
DIM SHARED CodeTree(1 TO 512) AS NodeType

' Create a frequency table for assistance in building code tree
DIM SHARED FreqTable(0 TO 255) AS TableType

' Create a CodeTable for holding binary strings
DIM SHARED CodeTable(0 TO 255) AS STRING
DIM SHARED Byte AS STRING * 1, FreeTree AS INTEGER, Root AS INTEGER
DIM SHARED UniqueCodes, UniqueBytes, InFile$, outfile$, FileoutBuffer$
DIM SHARED OutByte AS STRING * 1, BytePos AS INTEGER, OutInt AS INTEGER
FreeTree = LBOUND(CodeTree)
DIM SHARED BitTable(0 TO 8) AS INTEGER, start!
FOR x% = 0 TO 8
	BitTable(x%) = 2 ^ x%
NEXT
'*********** End Data Initialization *************



CLS
PRINT "Molnar \ Kucalaba Productions' File Encoder v1.1"
PRINT
PRINT "Please select an input file: ";
INPUT "", InFile$
PRINT "Please select an output file: ";
INPUT "", outfile$

File.ExpectedFileName = StrippedName$(InFile$)


' The SUB EncodeFile just calls a bunch of other SUBs to actually do
' all of the encoding
CALL EncodeFile





SYSTEM
NoFile:
PRINT "Error: "; InFile$; " not found!"
SYSTEM

SUB BuildCodeTable
	' Continuously remove codes from the tree until we have a code
	' for each unique byte found in the file.  NOTE: This trashes the
	' code tree.
	DO UNTIL UniqueCodes = UniqueBytes
		CALL RemoveAnyCode(Root, dummy$)
		dummy$ = ""
		UniqueCodes = UniqueCodes + 1
	LOOP
END SUB

SUB BuildCodeTree
	' Scan the frequency table and build a "forest" of trees
	FOR x% = 0 TO 255
		' if code existed in file then create a tree for it
		IF FreqTable(x%).Freq <> 0 THEN
			CodeTree(FreeTree).Code = CHR$(x%)
			CodeTree(FreeTree).Freq = FreqTable(x%).Freq
			CodeTree(FreeTree).GenBool = "T"
			FreeTree = FreeTree + 1
			UniqueBytes = UniqueBytes + 1
		END IF
	NEXT
	' At this point the CodeTree holds all the important data

	Root = FreeTree

	' Now we have to remove the two smallest trees and merge them
	' continuously until there is only one tree left in the "forest"
	DO WHILE EnabledTrees% > 1
		' First, find the two smallest trees and disable them
		' so they don't get merged again after this one time
		Node1% = FindSmallestTree%
		CodeTree(Node1%).GenBool = "F"
	       
		Node2% = FindSmallestTree%
		CodeTree(Node2%).GenBool = "F"

		' Now we need to create a new node
		' --- make sure the tree is enabled for future merging
		CodeTree(FreeTree).GenBool = "T"
		' --- fill in the two subtree pointers
		CodeTree(FreeTree).Lptr = Node1%
		CodeTree(FreeTree).Rptr = Node2%
		' --- the frequency is now the sum of the two children
		CodeTree(FreeTree).Freq = CodeTree(Node1%).Freq + CodeTree(Node2%).Freq
		FreeTree = FreeTree + 1
		
	LOOP
		       
	Root = FreeTree - 1
END SUB

SUB BuildFreqTable
	' Verify that file exists
	ON ERROR GOTO NoFile
	OPEN InFile$ FOR INPUT AS #1
	CLOSE #1
	ON ERROR GOTO 0

	' Open file and count occurences of each character
	OPEN InFile$ FOR BINARY AS #1
		File.ExpectedFileLen = LOF(1)
		DO
			' Input in big chunks to make program go faster
			IF LOF(1) - LOC(1) > ChunkSize THEN
				chunk$ = SPACE$(ChunkSize)
			      
				' UPDATE DISPLAY
				' REMOVE THIS CODE
				LOCATE 23, 28: PRINT LOC(1)
				LOCATE 24, 30: PRINT USING "##.##"; (LOC(1) / File.ExpectedFileLen) * 100
				LOCATE 25, 28: PRINT USING "####.###"; TIMER - start!

			ELSE
				chunk$ = SPACE$(LOF(1) - LOC(1))

			END IF
		       
			GET #1, , chunk$

			' Parse chunk and count each byte totals
			WHILE LEN(chunk$) > 0
				Byte = LEFT$(chunk$, 1)
				chunk$ = RIGHT$(chunk$, LEN(chunk$) - 1)
			       
				' Increment count
				FreqTable(ASC(Byte)).Count = FreqTable(ASC(Byte)).Count + 1
				FreqTable(ASC(Byte)).GenBool = "T"
			WEND

		LOOP UNTIL LOC(1) = LOF(1)
			     
	' UPDATE DISPLAY
	' REMOVE THIS CODE
	LOCATE 23, 28: PRINT LOC(1)
	LOCATE 24, 30: PRINT USING "##.##"; (LOC(1) / File.ExpectedFileLen) * 100
	LOCATE 25, 28: PRINT USING "####.###"; TIMER - start!
       
	CLOSE #1

	' Now take counts and round them off into one byte
	FOR x% = 0 TO 255
		IF FreqTable(x%).GenBool = "T" THEN
			FreqTable(x%).Freq = ((FreqTable(x%).Count / File.ExpectedFileLen) * 254) + 1
		END IF
	NEXT

END SUB

SUB DisplayCodeTree (CurRoot%, Level%)
IF CodeTree(CurRoot%).Lptr = 0 AND CodeTree(CurRoot%).Rptr = 0 THEN
	COLOR 13, 8
	PRINT ASC(CodeTree(CurRoot%).Code); CodeTree(CurRoot%).Freq; CurRoot%
	COLOR 7, 8
ELSE
	IF CodeTree(CurRoot%).Lptr <> 0 THEN
		PRINT "Left Level "; Level%; " :";
		CALL DisplayCodeTree(CodeTree(CurRoot%).Lptr, Level% + 1)
	END IF
	IF CodeTree(CurRoot%).Rptr <> 0 THEN
		PRINT "Right Level "; Level%; " :";
		CALL DisplayCodeTree(CodeTree(CurRoot%).Rptr, Level% + 1)
	END IF
END IF
END SUB

SUB DisplayInfo
	' Displays some neat information that helps convey how
	' the algorithm works
	max# = -32767
	min# = 32767
	maxptr% = 0
	minptr% = 0

	FOR x% = 0 TO 255
		IF FreqTable(x%).Count >= max# THEN
			max# = FreqTable(x%).Count
			maxptr% = x%
		END IF
		IF FreqTable(x%).Count > 0 THEN
			IF FreqTable(x%).Count <= min# THEN
				min# = FreqTable(x%).Count
				minptr% = x%
			END IF
		END IF
	NEXT
	
	LOCATE 11, 37
	IF maxptr% > 32 THEN
		PRINT CHR$(maxptr%);
	ELSE
		PRINT "ASC("; LTRIM$(STR$(maxptr%)); ")";
	END IF
	LOCATE 12, 36: PRINT max#; "times"
	LOCATE 13, 37: PRINT CodeTable(maxptr%)
	
	LOCATE 15, 37:
	IF minptr% > 32 THEN
		PRINT CHR$(minptr%);
	ELSE
		PRINT "ASC("; LTRIM$(STR$(minptr%)); ")";
	END IF
	LOCATE 16, 36: PRINT min#; "times"
	LOCATE 17, 37: PRINT CodeTable(minptr%)
END SUB

SUB DisplayMoreInfo
	OPEN InFile$ FOR BINARY AS #1
		Size1# = LOF(1)
	CLOSE #1
	OPEN outfile$ FOR BINARY AS #1
		Size2# = LOF(1)
	CLOSE #1
	LOCATE 37, 28: PRINT Size1#
	LOCATE 38, 28: PRINT Size2#
	reduced! = (1 - (Size2# / Size1#)) * 100
	LOCATE 39, 28: PRINT Size1# - Size2#
	LOCATE 40, 29:
	PRINT LTRIM$(STR$(INT(reduced!))); "%"

	COLOR 7, 8
END SUB

SUB DisplayTables
	' Display all characters that were found in input file, their
	' frequency, and their assigned binary code.
	FOR x% = 0 TO 255
		IF FreqTable(x%).Count <> 0 THEN
			PRINT "Byte: ";
			IF x% > 32 THEN
				PRINT CHR$(x%);
			ELSE
				COLOR 13, 8
				PRINT "x";
				COLOR 7, 8
			END IF
			PRINT " ("; LTRIM$(STR$(x%)); ")"; TAB(14); " occurs ";
			COLOR 14, 8: PRINT FreqTable(x%).Count;
			COLOR 7, 8: PRINT "times. ("; FreqTable(x%).Freq; "%)";
			PRINT TAB(40); "Binary code assigned: ";
			COLOR 14, 8: PRINT CodeTable(x%): COLOR 7, 8
		END IF
	NEXT
END SUB

FUNCTION EnabledTrees%
	FOR x% = 1 TO FreeTree
		IF CodeTree(x%).GenBool = "T" THEN ctr% = ctr% + 1
	NEXT
	EnabledTrees% = ctr%
END FUNCTION

SUB EncodeFile
CLS
WIDTH 80, 50
PRINT "͵ M \ K Productions' Encoder v1.1 ͸"
PRINT "                                                                              "
PRINT " Frequency Table                 :"; TAB(80); ""
PRINT " Code Tree                       :"; TAB(80); ""
PRINT " Binary String Table             :"; TAB(80); ""
PRINT "                                                                              "
PRINT " Size of Code Tree               : "; TAB(80); "";
PRINT " Input file size                 : "; TAB(80); "";
PRINT " Number of unique 8 bit strings  :"; TAB(80); ""
PRINT "                                                                              "
PRINT " Most Frequent 8-bit string...   :"; TAB(80); ""
PRINT " ...occurs in the input file     :"; TAB(80); ""
PRINT " ...and was assigned binary code :"; TAB(80); ""
PRINT "                                                                              "
PRINT " Least Frequent 8-bit string...  :"; TAB(80); ""
PRINT " ...occurs in the input file     :"; TAB(80); ""
PRINT " ...and was assigned binary code :"; TAB(80); ""
PRINT "                                                                              "
PRINT " Input/Output Chunk Size         :"; ChunkSize; TAB(80); ""
PRINT "                                                                              "
PRINT "Ĵ Frequency Status Ĵ"
PRINT "                                                                              "
PRINT " Number of bytes counted : "; TAB(80); "";
PRINT " Percentage complete     :"; TAB(80); "";
PRINT " Elapsed time (seconds)  :"; TAB(80); "";
PRINT "                                                                              "
PRINT "Ĵ Encoding Status Ĵ"
PRINT "                                                                              "
PRINT " Number of bytes scanned : "; TAB(80); "";
PRINT " Number of bytes encoded : "; TAB(80); "";
PRINT " KBs encoded per second  :"; TAB(80); "";
PRINT " Percentage complete     :"; TAB(80); "";
PRINT " Elapsed time (seconds)  :"; TAB(80); "";
PRINT "                                                                              "
PRINT "Ĵ Compression Details Ĵ"
PRINT "                                                                              "
PRINT " Input file size         : "; TAB(80); "";
PRINT " Output file size        :"; TAB(80); "";
PRINT " Saved number of bytes   :"; TAB(80); "";
PRINT " Compression %           :"; TAB(80); "";
PRINT "                                                                              "
PRINT ";"



' First step is to scan a file and build the Frequency Table
start! = TIMER
CALL BuildFreqTable
LOCATE 3, 37: PRINT "built"
LOCATE 8, 36: PRINT File.ExpectedFileLen; "bytes"


' Now build the code tree from the frequency table
CALL BuildCodeTree
LOCATE 4, 37: PRINT "built"
LOCATE 9, 36: PRINT UniqueBytes
LOCATE 7, 36: PRINT Root; "nodes"

' Now build a table of binary strings used to represent each byte in the file
CALL BuildCodeTable
LOCATE 5, 37: PRINT "built"


CALL DisplayInfo




' We output the header...
CALL WriteHeader

' ...and write the frequency table
CALL WriteFreqTable

' ...and the data stream
start! = TIMER
CALL WriteFileData
CALL DisplayMoreInfo
LOCATE 43, 1: PRINT "                            [hit any key to exit]"
a$ = INPUT$(1)
END SUB

FUNCTION FindSmallestTree%
	' Find the smallest enabled tree in forest and return pointer to it
	min% = 32767
	Ptr% = FirstFreeTree%
	FOR x% = Ptr% TO FreeTree - 1
		' if code is enabled then check value
		IF CodeTree(x%).GenBool = "T" THEN
			' If find a new smallest frequency then
			' set new pointer value (to be returned)
			' and new Min and disable node
			IF CodeTree(x%).Freq < min% THEN
				Ptr% = x%
				min% = CodeTree(x%).Freq
			END IF
		END IF
	NEXT
       
	' Return the smallest pointer
	FindSmallestTree% = Ptr%
END FUNCTION

FUNCTION FirstFreeTree%
	FOR x% = 1 TO FreeTree - 1
		IF CodeTree(x%).GenBool = "T" THEN
			FirstFreeTree% = x%
			EXIT FUNCTION
		END IF
	NEXT
END FUNCTION

SUB RemoveAnyCode (CurRoot%, binary$)
' This procedure pulls out an "arbitrary" binary code in the form of a string
' of one's and zeros.  This procedure was very difficult to write since
' they are several issues that need to be addressed, namely when to kill
' links between parents and children in the tree and when to get rid of
' parents having neither a character nor children.  It is for those reasons
' that the code is pretty messy.

IF CodeTree(CurRoot%).Lptr = NoSubTree AND CodeTree(CurRoot%).Rptr = NoSubTree THEN
	' Put the binary string into the code table for easier
	' encoding
	CodeTable(ASC(CodeTree(CurRoot%).Code)) = binary$

	' Binary$ is worthless now so we use special tags on it to
	' remove the proper children
	binary$ = binary$ + "x"
ELSE
	IF CodeTree(CurRoot%).Lptr <> NoSubTree AND RIGHT$(binary$, 1) <> "y" THEN
		' If the left subtree has more children recurse further as long
		' as we didnt already finish the code
		binary$ = binary$ + "1"
		CALL RemoveAnyCode(CodeTree(CurRoot%).Lptr, binary$)
	       
		'sever link between parent and child
		IF RIGHT$(binary$, 1) = "x" THEN
			CodeTree(CurRoot%).Lptr = NoSubTree
		       
			' If the parent has other children then it gets
			' a tag that preserves its children
			IF CodeTree(CurRoot%).Rptr <> 0 THEN
				binary$ = binary$ + "y"
			END IF
		END IF
	END IF
	IF CodeTree(CurRoot%).Rptr <> NoSubTree AND RIGHT$(binary$, 1) <> "y" THEN
		binary$ = binary$ + "0"
		CALL RemoveAnyCode(CodeTree(CurRoot%).Rptr, binary$)
		IF RIGHT$(binary$, 1) = "x" THEN
			CodeTree(CurRoot%).Rptr = NoSubTree
			IF CodeTree(CurRoot%).Lptr <> NoSubTree THEN
				binary$ = binary$ + "y"
			END IF
		END IF
	END IF
END IF
END SUB

FUNCTION StrippedName$ (in$)
	' Takes a file name+path and returns only the file name
	' i.e. strips away the directory information
	work$ = in$
	WHILE INSTR(work$, "\") <> 0
		work$ = RIGHT$(work$, LEN(work$) - INSTR(work$, "\"))
	WEND
	StrippedName$ = work$
END FUNCTION

SUB WriteEncodedByte
       ' Write an encoded byte to output buffer and output buffer if needed
	FileoutBuffer$ = FileoutBuffer$ + CHR$(OutInt)
	
	IF LEN(FileoutBuffer$) = ChunkSize THEN
		LOCATE 29, 28: PRINT LOC(2)
		LOCATE 30, 28: PRINT LOF(1)
		LOCATE 31, 28: PRINT USING "####.#"; (LOF(1) / (TIMER - start! + .0001)) / 1024
		LOCATE 32, 30: PRINT USING "##.##"; (LOC(2) / File.ExpectedFileLen) * 100
		LOCATE 33, 28: PRINT USING "####.###"; TIMER - start!

		PUT #1, , FileoutBuffer$
		FileoutBuffer$ = ""
	END IF
	OutInt = 0
	BytePos = 0
END SUB

SUB WriteFileData
	' First, open both files
	OPEN InFile$ FOR BINARY AS #2
	OPEN outfile$ FOR BINARY AS #1
       
	' Start writing to file again at last position
	SEEK #1, LOF(1) + 1
	FileoutBuffer$ = ""
	OutInt = 0
	' Now loop through input file again
	DO
		IF LOF(2) - LOC(2) > ChunkSize THEN
			chunk$ = SPACE$(ChunkSize)
		ELSE
			chunk$ = SPACE$(LOF(2) - LOC(2))
		END IF
	       
		' Get byte chunk from input file
		GET #2, , chunk$
	       
		' Go through chunk, look up binary code, and encode/write bytes
		WHILE LEN(chunk$) > 0
		       
			Byte = LEFT$(chunk$, 1)
			chunk$ = RIGHT$(chunk$, LEN(chunk$) - 1)

			' Use lookup table for binary string
			binary$ = CodeTable(ASC(Byte))

			WHILE LEN(binary$) > 0
				' Pull out first bit and set the OutByte
				' correctly
				bit$ = LEFT$(binary$, 1)
		       
				IF BytePos < 8 THEN
					IF bit$ = "1" THEN
						OutInt = OutInt + BitTable(BytePos)
					END IF
					BytePos = BytePos + 1
				END IF
				IF BytePos = 8 THEN CALL WriteEncodedByte
				binary$ = RIGHT$(binary$, LEN(binary$) - 1)
			WEND
		WEND
	LOOP UNTIL LOC(2) = LOF(2)
       
	IF BytePos <> 0 THEN
		FileoutBuffer$ = FileoutBuffer$ + CHR$(OutInt)
	END IF
	IF LEN(FileoutBuffer$) > 0 THEN
		PUT #1, , FileoutBuffer$
	END IF

	CLOSE #1
	CLOSE #2
END SUB

SUB WriteFreqTable
	' Output the frequency table to file
	' The frequency table is a 255 string of bytes each
	' representing the approximate frequency of the respective
	' character in the output file.  From this information the code
	' tree can be built.

	FileoutBuffer$ = ""
	OPEN outfile$ FOR BINARY AS #1
      
		' Start writing to file again at last position
		SEEK #1, LOF(1) + 1
		FOR x% = 0 TO 255
			a$ = CHR$(FreqTable(x%).Freq)
			PUT #1, , a$
		NEXT

	CLOSE #1
END SUB

SUB WriteHeader
	' Format for *.MKZ files
	' First 3 bytes     : MKZ verification
	' Next byte         : Reserved
	' Next couple bytes : MKZ Version
	' Next 12 bytes     : Expected file name
	' Next couple bytes : Expected file length
	' Next 256 bytes    : Freq Table
	' Remainder of file : Bit stream

	File.MKZ = "MKZ"
	File.Reserved = "x"
	File.Version = 1.1
	
	
	OPEN outfile$ FOR BINARY AS #1
		IF LOF(1) <> 0 THEN
			CLOSE #1
			KILL outfile$
			OPEN outfile$ FOR BINARY AS #1
		END IF
		PUT #1, , File
	CLOSE #1
END SUB

