/*
This file is part of BOP.
Copyright (C) 2004 Patrick Davalan
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
The GNU General Public License text is also available at
http://www.gnu.org/
or on the Copyright holder web site :
http://patrick.davalan.free.fr/gnu-gpl.html
*/
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/unistd.h>
#include <errno.h>
#include <limits.h>
#include <stdlib.h>
// change the include to #include <bop.2/bop.h>
#include "bop.h"
#include "bopmakeh.h"
#define DEBUG 0
//
// This program reads the words from a file and counts, for each word,
// the number of its occurrences in the file.
// Then it prints the words sorted by occurrence count, the most used first.
// I still have some questions about the usefulness of this program...
// It is intended as an example of how to use the bop API.
// This program deals with hashes and lists.
//
//
// subScan - callback handed to boplScanF() by listScan(); it is invoked
// for each entry of a words sublist and prints that entry's word.
//
// data  : points to an int counting the words already written on the
//         current output line; it is updated here.
// entry : sublist entry whose data is the word (a C string).
//
// Output goes to stdout: each word is followed by one space, and a
// "\n\t" line break is emitted before the word once ten words have been
// written on the current line.
// Returns false so that the scan continues.
static int
subScan( void * data, BoplEntry * entry )
{
    int * printed = (int *) data ;
    const char * text ;

    // print 10 words max in a line
    if ( *printed >= 10 )
    {
        *printed = 0 ;
        fputs( "\n\t", stdout ) ;
    }
    *printed += 1 ;

    text = boplGetData( entry ) ;
    fprintf( stdout, "%s ", text ) ;

    return ( false ) ; // to continue scan
}
// listScan - callback designed to be called by boplScanB() for each
// entry of the highest level list.
//
// data    : unused.
// subList : top-level entry; its data is read as an unsigned long
//           occurrence count, and it is scanned as the sublist of the
//           words sharing that count.
//
// Prints the occurrence count followed by "\n\t", then lets boplScanF()
// call subScan() on every word of the sublist, then prints a newline.
// Returns false so that the scan continues.
static int
listScan( void * data, BoplEntry * subList )
{
    int wordsOnLine = 0 ;   // per-line word counter updated by subScan()
    unsigned long lCount ;

    (void) data ;           // unused callback argument

    lCount = * (unsigned long *) ( boplGetData( subList ) ) ;
    // lCount is unsigned, so the conversion must be %lu (was %ld)
    fprintf( stdout, "%lu\n\t", lCount ) ;

    // print words with the same occurrence count in normal ascending
    // collating sequence (i.e. Albert before Georges but Bush
    // before Einstein)
    boplScanF( &wordsOnLine, subList, subScan ) ;
    fprintf( stdout, "\n" ) ;

    return ( false ) ; // to continue scan
}
// hToL - callback designed to be called by bophScan() for each entry
// in the hash; it files the entry into the two-level list.
//
// arg1   : the top-level list (a BoplEntry *). Each of its entries holds
//          an occurrence count and is the sublist of the words that
//          have that count. Entries are kept in ascending count order,
//          sublists in ascending strcmp() order of the words.
// hEntry : hash entry; its key is the word and its data is read as the
//          unsigned long occurrence count of that word.
//
// If no top-level entry carries the count yet, one is created at the
// sorted position and the count is copied into it. The word is then
// inserted at its sorted position in that sublist. Finding the word
// already present is reported through bopxAbort().
// Returns false so that the hash scan is not stopped.
static int
hToL( void * arg1 , BophEntry * hEntry )
{
    BoplEntry * countList = (BoplEntry *) arg1 ;
    char * word = bophGetKey( hEntry ) ;
    // original note: word string size + 1 (presumably the terminating
    // NUL is counted - to be confirmed against bophGetKeyLength)
    size_t wordLen = bophGetKeyLength( hEntry ) ;
    unsigned long wanted = * ( unsigned long *) ( bophGetData( hEntry ) ) ;
    unsigned long seen = ULONG_MAX ; // to avoid a compilation warning
    BoplEntry * cursor ;
    BoplEntry * bucket ;
    int cmp ;

    // walk the top-level list up to the first entry whose count is not
    // lower than the wanted one
    cursor = boplGetFirst( countList ) ;
    while ( ! boplIsEnd( cursor ) )
    {
        seen = * (unsigned long *) ( boplGetData( cursor ) ) ;
        if ( seen >= wanted )
        {
            break ;
        }
        cursor = boplGetNext( cursor ) ;
    }

    if ( ! boplIsEnd( cursor ) && seen <= wanted )
    {
        // an entry with exactly this count already exists
        bucket = cursor ;
    }
    else
    {
        // no entry for this count : create a sublist in front of the
        // stop position and store the count in it
        bucket = boplCreSubBefore( cursor ) ;
        boplCopyData( bucket,
                      bophGetData( hEntry ),
                      bophGetDataLength( hEntry ) ) ;
    }

    // walk the sublist up to the first word that does not sort before
    // the new one
    cmp = 1 ; // in case of an empty list
    cursor = boplGetFirst( bucket ) ;
    while ( ! boplIsEnd( cursor ) )
    {
        cmp = strcmp( word, (char *) ( boplGetData( cursor ) ) ) ;
        if ( cmp <= 0 )
        {
            break ;
        }
        cursor = boplGetNext( cursor ) ;
    }

    // each word comes from a distinct hash entry, so it should never be
    // in the sublist already
    if ( cmp == 0 )
    {
        bopxAbort( "word already in sublist" ) ;
    }

    // cursor is either the end of the sublist or a greater word :
    // insert the new word in front of it
    cursor = boplCreBefore( cursor ) ;
    boplCopyData( cursor, word, wordLen ) ;

    return( false ) ; // don't stop the hash scan
}
// main - usage : bopwc word-file [buckets]
//
// argv[1] : file the words are read from (through bopMakeH()).
// argv[2] : optional hash size; when absent a size is derived from the
//           size of the input file.
//
// Steps: create the hash, fill it from the file, create a list, let
// bophScan() call hToL() on every hash entry to build the list, print
// the list with boplScanB()/listScan(), then delete hash and list.
// Progress messages go to stderr, the result goes to stdout.
// Exits with EXIT_FAILURE on a usage error or when an object cannot be
// created, EXIT_SUCCESS otherwise.
int
main(int argc, char **argv)
{
    struct stat statBuf ;
    BoplHandle * lHandle ;
    BophHandle * hash ;
    BoplEntry * list ;
    int size ;

    bopmTrace( ) ;

    if ( argc < 2 )
    {
        // argv[0] may be missing when argc is 0
        fprintf( stderr, "%s missing args\n",
                 ( argc > 0 && argv[0] != NULL ) ? argv[0] : "bopwc" ) ;
        fprintf( stderr, "usage : bopwc word-file [buckets]\n" ) ;
        exit ( EXIT_FAILURE ) ;
    }

    // try to choose a hash size
    if ( argc > 2 )
    {
        char * end ;
        long buckets ;

        // strtol() instead of atoi() : garbage, trailing characters,
        // negative and out of range values are rejected instead of being
        // silently turned into some number. Zero is still handed over
        // to bophNew(), which decides what to do with it.
        errno = 0 ;
        buckets = strtol( argv[2], &end, 10 ) ;
        if ( end == argv[2] || *end != '\0' || errno == ERANGE
             || buckets < 0 || buckets > INT_MAX )
        {
            fprintf( stderr, "%s: invalid bucket count '%s'\n",
                     argv[0], argv[2] ) ;
            fprintf( stderr, "usage : bopwc word-file [buckets]\n" ) ;
            exit ( EXIT_FAILURE ) ;
        }
        size = (int) buckets ;
    }
    else
    {
        long long guess ;

        if ( stat( argv[1], &statBuf ) != 0 )
        {
            bopxAbort( "cannot stat on input file" ) ;
        }
        // why not ! - computed in a wide type and clamped so that a
        // very large file cannot overflow the int size
        guess = 3333 + (long long) statBuf.st_size / 73 ;
        size = ( guess > INT_MAX ) ? INT_MAX : (int) guess ;
    }
#if ( DEBUG > 0 )
    fprintf( stderr, "hash size %d\n", size ) ;
#endif

    // create Hash
    fprintf( stderr, "creating hash\n" ) ;
    if ( (hash = bophNew( NULL, "count hash", size, NULL, NULL ) ) == NULL )
    {
        fprintf( stderr,
                 "bophNew failed to create a size %d hash\n",
                 size ) ;
        exit ( EXIT_FAILURE ) ;
    }

    // fill the hash
    fprintf( stderr, "filling hash\n" ) ;
    if ( ! bopMakeH( hash, argv[1] ) )
    {
        bopxAbort( "while filling hash" ) ;
        exit ( EXIT_FAILURE ) ;
    }

    // create a list
    // NOTE(review): the NULL checks assume boplNew()/boplNewList()
    // report failure by returning NULL, as bophNew() does - confirm.
    fprintf( stderr, "creating list\n" ) ;
    lHandle = boplNew( NULL, "count list object" ) ;
    if ( lHandle == NULL )
    {
        fprintf( stderr, "boplNew failed to create the list object\n" ) ;
        exit ( EXIT_FAILURE ) ;
    }
    list = boplNewList( lHandle ) ;
    if ( list == NULL )
    {
        fprintf( stderr, "boplNewList failed to create the list\n" ) ;
        exit ( EXIT_FAILURE ) ;
    }

    // scan the hash and fill the list
    // in this case, bophScan() should return false, because it
    // should have scanned the entire table
    fprintf( stderr, "scanning hash\n" ) ;
    if ( bophScan( list, hash, hToL ) )
    {
        bopxAbort("while scanning hash") ;
    }

    // print the list
    fprintf( stderr, "printing list\n" ) ;
    boplScanB( NULL, list, listScan ) ;

    fprintf( stderr, "deleting hash\n" ) ;
    bophDelete( NULL, hash ) ;
    fprintf( stderr, "deleting list\n" ) ;
    boplDelEntry( NULL, list ) ;
    boplDelete( NULL, lHandle );

    fprintf( stderr, "exiting\n" ) ;
    bopmMem( ) ;
    exit( EXIT_SUCCESS ) ;
}