// bopwc.c


/*
    This file is part of BOP.
    Copyright (C) 2004  Patrick Davalan

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

    The GNU General Public License text is also available at
    http://www.gnu.org/
    or on the Copyright holder web site :
    http://patrick.davalan.free.fr/gnu-gpl.html
*/
#include <errno.h>
#include <limits.h>
#include <stdlib.h>

#include <sys/types.h>
#include <sys/stat.h>
#include <sys/unistd.h>

// change the include to #include <bop.2/bop.h>
#include "bop.h"
#include "bopmakeh.h"

#define DEBUG 0
//
// This program reads the words from a file and counts, for each word,
// the number of its occurrences in the file.
// Then it prints the words sorted by occurrence count, the most used first.
// I still have some questions about the usefulness of this program...
// It is intended as an example of how to use the bop API.
// This program deals with hashes and lists.
//
//

static int
subScan( void * data, BoplEntry * entry )
{
    // Callback handed to boplScanF() by listScan(): runs once for every
    // entry of a word sublist and writes that entry's word to stdout.
    //
    // data  : points to an int counting the words already written on the
    //         current output line; it is updated in place.
    // entry : the sublist entry whose data is the word to print.
    //
    // Always returns false so that the scan carries on.
    int * wordsOnLine = (int *) data ;
    char * text ;

    // at most 10 words per line: start a fresh, tab-indented line once
    // the counter has gone past 9
    if ( *wordsOnLine > 9 )
    {
        *wordsOnLine = 0 ;
        fputs( "\n\t", stdout ) ;
    }
    *wordsOnLine += 1 ;

    text = boplGetData( entry ) ;
    fprintf( stdout, "%s ", text ) ;

    return ( false ) ;  // keep scanning
}
 
static int
listScan( void * data, BoplEntry * subList )
{
    // Callback handed to boplScanB() by main(): runs once for every entry
    // of the top-level list. Each such entry is a sublist whose own data
    // is an occurrence count (unsigned long) and whose entries are the
    // words that occur that many times.
    //
    // Prints the count on its own line, then the words of the sublist
    // (via boplScanF/subScan), then a newline.
    //
    // data    : unused.
    // subList : the top-level entry / sublist to print.
    //
    // Always returns false so that the scan carries on.

    (void) data ;   // intentionally unused
    int n = 0 ;     // words printed so far on the current line (for subScan)
    unsigned long lCount ;

    lCount = * (unsigned long *) ( boplGetData( subList ) ) ;
    // lCount is unsigned long, so it must be printed with %lu (not %ld)
    fprintf( stdout, "%lu\n\t", lCount ) ;

    // print words with the same occurrence count in normal ascending
    // collating sequence (i.e. Albert before Georges but Bush
    // before Einstein)
    boplScanF( &n, subList, subScan ) ;
    fprintf( stdout, "\n" ) ;

    return ( false ) ;  // to continue scan
}

static int
hToL( void * arg1 , BophEntry * hEntry )
{
    // Callback handed to bophScan() by main(): runs once for every entry
    // of the hash and files the entry's word into a two-level list.
    //
    // arg1   : the top-level list (a BoplEntry *). Each of its entries is
    //          a sublist carrying an occurrence count as data; the
    //          top-level list is kept in ascending count order.
    // hEntry : hash entry; its key is the word, its data the count
    //          (read as an unsigned long).
    //
    // The word is inserted in the sublist matching its count (created on
    // demand), before the first word that does not compare lower with
    // strcmp(), so each sublist stays sorted.
    //
    // Always returns false so that the hash scan is not stopped.
    BoplEntry * countList = (BoplEntry *) arg1 ;
    BoplEntry * cursor ;
    BoplEntry * bucket ;
    char * word = bophGetKey( hEntry ) ;
    // key length as reported by the hash (original note: string size + 1)
    size_t wordLen = bophGetKeyLength( hEntry ) ;
    unsigned long wanted = * ( unsigned long *) ( bophGetData( hEntry ) ) ;
    unsigned long seen = ULONG_MAX ;    // only to silence a compiler warning
    int cmp ;

    // Step 1: walk the top-level list up to the first sublist whose
    // count is not smaller than the count we are looking for.
    cursor = boplGetFirst( countList ) ;
    while ( ! boplIsEnd( cursor ) )
    {
        seen = * (unsigned long *) ( boplGetData( cursor ) ) ;
        if ( wanted <= seen )
        {
            break ;
        }
        cursor = boplGetNext( cursor ) ;
    }

    // Step 2: reuse the sublist when its count matches exactly,
    // otherwise create a new sublist in front of the stop position and
    // store the count in it.
    if ( ! boplIsEnd( cursor ) && wanted >= seen )
    {
        bucket = cursor ;
    }
    else
    {
        bucket = boplCreSubBefore( cursor ) ;
        boplCopyData( bucket,
                bophGetData( hEntry ),
                bophGetDataLength( hEntry )
            ) ;
    }

    // Step 3: walk the sublist up to the first word that is not lower
    // than ours.
    cmp = 1 ;   // value kept when the sublist is empty
    cursor = boplGetFirst( bucket ) ;
    while ( ! boplIsEnd( cursor ) )
    {
        cmp = strcmp( word, (char *) ( boplGetData( cursor ) ) ) ;
        if ( cmp <= 0 )
        {
            break ;
        }
        cursor = boplGetNext( cursor ) ;
    }

    // each word is expected to show up only once
    if ( cmp == 0 )
    {
        bopxAbort( "word already in sublist" ) ;
    }

    // Step 4: cursor is either the end of the sublist or a greater word;
    // create the new entry just before it and copy the word in.
    cursor = boplCreBefore( cursor ) ;
    boplCopyData( cursor, word, wordLen ) ;

    return( false ) ;   // don't stop the hash scan
}

int 
main(int argc, char **argv)
{
    // Entry point.
    //
    // usage : bopwc word-file [buckets]
    //
    //   word-file : file handed to bopMakeH() to fill the hash.
    //   buckets   : optional hash size; when omitted, a size is derived
    //               from the size of word-file.
    //
    // Steps: create and fill the hash, turn it into a list of sublists
    // (hToL), print that list from its last entry to its first
    // (boplScanB/listScan), then release everything.
    //
    // Exits with EXIT_FAILURE on bad arguments or on any failed step,
    // EXIT_SUCCESS otherwise. Progress messages go to stderr, the report
    // goes to stdout.

    // Constants of the default hash-size guess (arbitrary : "why not !")
    enum { BASE_BUCKETS = 3333, BYTES_PER_BUCKET = 73 } ;

    struct stat statBuf ;
    BoplHandle * lHandle ;
    BophHandle * hash ;
    BoplEntry * list ;
    // argv[0] may legally be NULL when argc is 0
    const char * progName =
        ( argc > 0 && argv[0] != NULL ) ? argv[0] : "bopwc" ;

    int size ;

    bopmTrace( ) ; 

    if ( argc < 2 )
    {
        fprintf( stderr, "%s missing args\n", progName ) ;
        fprintf( stderr, "usage : bopwc word-file [buckets]\n" ) ;
        exit ( EXIT_FAILURE ) ;
    }
    
    // try to choose a hash size
    if ( argc > 2 )
    {
        // strtol instead of atoi : reject text that is not a number or
        // that does not fit in an int. Whether the value itself is an
        // acceptable size is left to bophNew().
        char * end ;
        long requested ;

        errno = 0 ;
        requested = strtol( argv[2], &end, 10 ) ;
        if ( end == argv[2] || *end != '\0' || errno == ERANGE
                || requested < INT_MIN || requested > INT_MAX )
        {
            fprintf( stderr, "%s invalid buckets value '%s'\n",
                progName, argv[2] ) ;
            fprintf( stderr, "usage : bopwc word-file [buckets]\n" ) ;
            exit ( EXIT_FAILURE ) ;
        }
        size = (int) requested ;
    }
    else 
    {
        off_t extra ;

        if ( stat( argv[1], &statBuf ) != 0 )
        {
            bopxAbort( "cannot stat on input file" ) ;
            // in case bopxAbort() returns : statBuf must not be used
            exit ( EXIT_FAILURE ) ;
        }
        // clamp so that a very large file cannot overflow the int size
        extra = statBuf.st_size / BYTES_PER_BUCKET ;
        if ( extra > INT_MAX - BASE_BUCKETS )
        {
            size = INT_MAX ;
        }
        else
        {
            size = BASE_BUCKETS + (int) extra ;
        }
    }

#if ( DEBUG > 0 )
    fprintf( stderr, "hash size %d\n", size ) ;
#endif
    
    // create Hash
    fprintf( stderr, "creating hash\n" ) ;
    if ( (hash = bophNew( NULL, "count hash", size, NULL, NULL ) ) == NULL )
    {
        fprintf( stderr,
            "bophNew failed to create a size %d hash\n",
            size ) ;
        exit ( EXIT_FAILURE ) ;
    }

    // fill the hash
    fprintf( stderr, "filling hash\n" ) ;
    if ( ! bopMakeH( hash, argv[1] ) )
    {
        bopxAbort( "while filling hash" ) ;
        exit ( EXIT_FAILURE ) ;
    }

    // create a list
    // NOTE(review): the NULL checks assume boplNew()/boplNewList() report
    // failure with NULL, like bophNew() above - confirm against the bop API.
    fprintf( stderr, "creating list\n" ) ;
    lHandle = boplNew( NULL, "count list object" ) ;
    if ( lHandle == NULL )
    {
        fprintf( stderr, "boplNew failed to create the list object\n" ) ;
        exit ( EXIT_FAILURE ) ;
    }
    list = boplNewList( lHandle ) ;
    if ( list == NULL )
    {
        fprintf( stderr, "boplNewList failed to create the list\n" ) ;
        exit ( EXIT_FAILURE ) ;
    }
    
    // scan the hash and fill the list
    // in this case, bophScan() should return false, because it 
    // should have scanned the entire table
    fprintf( stderr, "scanning hash\n" ) ;
    if (  bophScan( list, hash, hToL ) )
    {
        bopxAbort("while scanning hash") ;
    }

    // print the list
    fprintf( stderr, "printing list\n" ) ;
    boplScanB( NULL,  list, listScan ) ;

    fprintf( stderr, "deleting hash\n" ) ;
    bophDelete( NULL, hash ) ;
    
    fprintf( stderr, "deleting list\n" ) ;
    boplDelEntry( NULL, list ) ;

    boplDelete( NULL, lHandle ); 
    
    fprintf( stderr, "exiting\n" ) ;
    
    bopmMem( ) ;

    exit( EXIT_SUCCESS ) ;
}