/* This file is part of BOP. Copyright (C) 2004 Patrick Davalan This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA The GNU General Public License text is also available at http://www.gnu.org/ or on the Copyright holder web site : http://patrick.davalan.free.fr/gnu-gpl.html */ #include <sys/types.h> #include <sys/stat.h> #include <sys/unistd.h> // change the include to #include <bop.2/bop.h> #include "bop.h" #include "bopmakeh.h" #define DEBUG 0 // // This program reads the words from a file and count for each word // the number of its occurences in the file. // then it prints the words sorted by occurence number, the most used first. // I still have some questions about the usefulness of this program... // It is intended to be an example to use the bop API. // This program deals with hashs and lists. // // static int subScan( void * data, BoplEntry * entry ) { // this function is designed to be called by boplScanF for each // entry in the words sublists. // print the word. #define N ( * ((int *) data) ) char * word ; // print 10 words max in a line if ( N > 9 ) { N = 0 ; fprintf( stdout, "\n\t") ; } N++ ; word = boplGetData( entry ) ; fprintf( stdout, "%s ", word ) ; return ( false ) ; // to continue scan #undef N } static int listScan( void * data, BoplEntry * subList ) { // this function is designed to be called by boplScanB for each // entry in the highest level list. // print the occurence # of the word // call boplScanF to print the list of names data = data ; // avoid a warning int n = 0 ; unsigned long lCount ; lCount = * (unsigned long *) ( boplGetData( subList ) ) ; fprintf( stdout, "%ld\n\t", lCount ) ; // print words with the same occurence in normal ascending // collating sequence (i.e. Albert before Georges but Bush // before Einstein) boplScanF( &n, subList, subScan ) ; fprintf( stdout, "\n" ) ; return ( false ) ; // to continue scan } static int hToL( void * arg1 , BophEntry * hEntry ) { // this function is designed to be called by bophScan for each // entry in the hash. // the hash entry count is search in the list. // when not found, the list is updated, each entry in the list // is a sublist of the words matching the same count. // list is sorted on the count, the sublists are sorted on // the word int rc ; BoplEntry * list ; BoplEntry * lEntry ; BoplEntry * lSub ; unsigned long hCount ; unsigned long lCount ; char * hWord ; char * lWord ; size_t len ; // word string size + 1 //fprintf( stderr, "entering hToL\n" ) ; list = (BoplEntry *) arg1 ; hWord = bophGetKey( hEntry ) ; len = bophGetKeyLength( hEntry ) ; // search the list // We could have used boplScanF() hCount = * ( unsigned long *) ( bophGetData( hEntry ) ) ; lCount = ULONG_MAX ; // to avoid a compilation warning for ( lEntry = boplGetFirst( list ) ; ! boplIsEnd( lEntry ) ; lEntry = boplGetNext( lEntry ) ) { lCount = * (unsigned long *) ( boplGetData( lEntry ) ) ; if ( hCount > lCount ) continue ; break ; } // fprintf( stderr, "hCount=%ld lCount=%ld\n", hCount, lCount ) ; // was an entry found ? if ( boplIsEnd( lEntry ) || hCount < lCount ) { // not found in list : add it as a sublist lSub = boplCreSubBefore( lEntry ) ; // put the count in the entry boplCopyData( lSub, bophGetData( hEntry ), bophGetDataLength( hEntry ) ) ; } else { lSub = lEntry ; } // Here, either an entry matching the count was found or we // had created one. // search the sublist for a matching word // Here too , we could have used boplScanF() rc = 1 ; // in case of an empty list for ( lEntry = boplGetFirst( lSub ) ; ! boplIsEnd( lEntry ) ; lEntry = boplGetNext( lEntry ) ) { lWord = (char *) ( boplGetData( lEntry ) ) ; rc = strcmp( hWord, lWord ) ; if ( rc > 0 ) continue ; break ; } // we shouldn't find the word if ( rc == 0 ) { bopxAbort( "word already in sublist" ) ; } // the sublist entry lEntry is either the end of list or a word > // Add the new word before lEntry = boplCreBefore( lEntry ) ; // put the word in the entry boplCopyData( lEntry, hWord, len ) ; return( false ) ; // don't stop the hash scan } int main(int argc, char **argv) { struct stat statBuf ; BoplHandle * lHandle ; BophHandle * hash ; BoplEntry * list ; int size ; bopmTrace( ) ; if ( argc < 2 ) { fprintf( stderr, "%s missing args\n", argv[0] ) ; fprintf( stderr, "usage : bopwc word-file [buckets]\n" ) ; exit ( EXIT_FAILURE ) ; } // try to choose a hash size if ( argc > 2 ) { size = atoi( argv[2] ) ; } else { if ( stat( argv[1], &statBuf ) != 0 ) { bopxAbort( "cannot stat on input file" ) ; } size = 3333 + ( statBuf.st_size / 73 ) ; // why not ! } #if ( DEBUG > 0 ) fprintf( stderr, "hash size %d\n", size ) ; #endif // create Hash fprintf( stderr, "creating hash\n" ) ; if ( (hash = bophNew( NULL, "count hash", size, NULL, NULL ) ) == NULL ) { fprintf( stderr, "bophNew failed to create a size %d hash\n", size ) ; exit ( EXIT_FAILURE ) ; } // fill the hash fprintf( stderr, "filling hash\n" ) ; if ( ! bopMakeH( hash, argv[1] ) ) { bopxAbort( "while filling hash" ) ; exit ( EXIT_FAILURE ) ; } // create a list fprintf( stderr, "creating list\n" ) ; lHandle = boplNew( NULL, "count list object" ) ; list = boplNewList( lHandle ) ; // scan the hash and fill the list // in this case, bophScan() should return false, because it // should have scanned the entire table fprintf( stderr, "scanning hash\n" ) ; if ( bophScan( list, hash, hToL ) ) { bopxAbort("while scanning hash") ; } // print the list fprintf( stderr, "printing list\n" ) ; boplScanB( NULL, list, listScan ) ; fprintf( stderr, "deleting hash\n" ) ; bophDelete( NULL, hash ) ; fprintf( stderr, "deleting list\n" ) ; boplDelEntry( NULL, list ) ; boplDelete( NULL, lHandle ); fprintf( stderr, "exiting\n" ) ; bopmMem( ) ; exit( EXIT_SUCCESS ) ; }