/** Boyer-Moore version 3, a UNICODE Boyer-Moore pattern search class. Copyright (C) 2001 Doyle B. Myers This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA As a special exception, Doyle B. Myers gives permission to link this program with The Java platform and distribute the resulting executable, without including the source code for The Java platform in the source distribution. The phrase "The Java platform" specifically means the contents of all java.* packages as provided by Sun for either Commercial Use or under the Sun Community Source Code License. */ /** To use, either call the static matchAll() method, or instantiate a BoyerMoore with the desired pattern and case sensitivity, then call findAll( text) or setText( text) then the no-arg findAll(). All of these do the same thing, the static method is provided for convenience in a one-off search; the findAll that takes a text argument just calls setText and the no-arg flavor. All searches return a Vector of all matches. All results are 0-based (i.e., a match at the start of the text is reported as index 0). An empty Vector (not null) is returned when the pattern is not found in the text. Pattern and text are instances of java.lang.String. The whole UNICODE space is searched, so we use one rather large 64K int (256K byte) char[] for mBadCharSkipA. It's initialized when the pattern is set (in the constructor), so needless to say its really good if you can reuse a search pattern across multiple texts. To reuse a pattern, call setText( text ) and findAll, or findAll( text ). Case-insensitive searches are more expensive than case-sensitive (we make an uppercase copy of the pattern and text for case-insensitive searches). Changing the pattern in any way (including the ignore case setting) requires the skip arrays to be recomputed, which is essentially all of the work done when BoyerMoore is instantiated, so if you want to change the pattern or the case sensitivity, make a new BoyerMoore with the new pattern. */ /** Refrences: Charras, C., and Leroq, T., 1998. Exact string matching algorithms. Laboratoire d'Informatique de Rouen, UniversitŽ de Rouen, FacultŽ des Sciences et de Techniques, 76821 Mont-Saint-Aignan Cedex, FRANCE

Boyer, R. S., and Moore, J. S., 1977. A fast string searching algorithm. Comm. ACM. 20:762-772. */ package org.jbond.boyermoore; import java.util.Vector; public class BoyerMoore { private static final int ALPHABET_SIZE = 0x10000; // UNICODE alphabet size (64K chars) private int[] mBadCharSkipA; // one of the skip arrays (mismatched text char not in pattern) private int[] mGoodSuffixSkipA; // the other skip array (suffix, mismatched text char found in pattern) private char[] mPatternA; // chars of the pattern (what we're searching for) private int mPatternLength; // length of pattern private char[] mTextA; // chars of the text (the text we're searching in) private int mTextLength; // length of text private boolean mIgnoreCase; /** Convenience static method for finding all matches of inPattern within inText. */ static public Vector matchAll( String inPattern, String inText, boolean inIgnoreCase ) { BoyerMoore bm = new BoyerMoore( inPattern, inIgnoreCase ); return bm.findAll( inText ); } /** Allows for case-insensitive search. * * @param inPattern the pattern to search for * @param inIgnoreCase true to ignore case (case-insensitive search) */ public BoyerMoore( String inPattern, boolean inIgnoreCase ) { mIgnoreCase = inIgnoreCase; if( inIgnoreCase ) { inPattern = inPattern.toUpperCase(); } mPatternA = inPattern.toCharArray(); mPatternLength = mPatternA.length; mBadCharSkipA = new int[ ALPHABET_SIZE ]; // set to zero for free mGoodSuffixSkipA = new int[ mPatternLength + 1 ]; // set to zero for free // recompute these arrays whenever the pattern changes computeGoodSuffixSkipArray(); computeBadCharSkipArray(); } /** Compute the good prefix skip array. * The resulting mGoodSuffixSkipA is one of the important Boyer-Moore * data structures. Print it out if you want to check program operation. */ private void computeGoodSuffixSkipArray() { int i; int j; int p; int[] f = new int[ mPatternLength + 1 ]; j = mPatternLength + 1; f[ mPatternLength ] = j; for( i = mPatternLength; i > 0; i-- ) { while( j <= mPatternLength && mPatternA[ i - 1 ] != mPatternA[ j - 1 ] ) { if( mGoodSuffixSkipA[ j ] == 0 ) { mGoodSuffixSkipA[ j ] = j - i; } j = f[ j ]; } f[ i - 1 ] = --j; } p = f[ 0 ]; for( j = 0; j <= mPatternLength; ++j ) { if( mGoodSuffixSkipA[ j ] == 0 ) { mGoodSuffixSkipA[ j ] = p; } if( j == p ) { p = f[ p ]; } } } /** Compute the bad character skip array. * The resulting mBadCharSkipA is one of the important Boyer-Moore * data structures. Print it out if you want to check program operation. */ private void computeBadCharSkipArray() { for( int a = 0; a < ALPHABET_SIZE; a++ ) { mBadCharSkipA[ a ] = mPatternLength; } for( int j = 0; j < mPatternLength - 1; j++ ) { mBadCharSkipA[ mPatternA[ j ] ] = mPatternLength - j - 1; } } /** Future calls to findAll will search for the pattern in the text specified in inText. * The old text (if any) is lost forever. Changing the text does not require recomputing * the skip arrays (but case insensitive searches require making an uppercase copy of * the text). */ public void setText( String inText ) { if( mIgnoreCase ) { inText = inText.toUpperCase(); } mTextA = inText.toCharArray(); mTextLength = mTextA.length; } /** Just a shortcut for setText( inText ) then findAll(). * Note: Future calls to findAll() will search the text specified in this call. */ public Vector findAll( String inText ) { setText( inText ); return findAll(); } /** @return a Vector contaqining the index (0-based) of the each occurance of the pattern. * Returns an empty Vector (not null!) if pattern not found. * * This is the guts of the search. */ public Vector findAll() { Vector v = new Vector(); int i; int j; i = 0; while( i <= mTextLength - mPatternLength ) { for( j = mPatternLength - 1; j >= 0 && mPatternA[ j ] == mTextA[ i + j ]; --j ) { // note: empty loop while chars match } if( j < 0 ) { // off the left edge of the pattern, whole pattern matched // System.out.println( "Pattern match at " + i ); v.addElement( new Integer( i ) ); // add find to vector i += mGoodSuffixSkipA[ 0 ]; // always skip by suffix[0] after match // makes sense, since we know the last // comparison resulted in a match and was // therefore found in the text (therefore we // don't want to skip as a bad character); // otherwise it's exactly like a good prefix // mismatch at the 0'th position of the pattern. } else { // character mismatch, skip i += Math.max( // take the biggest of the two possible skips mGoodSuffixSkipA[ j + 1 ], // first one's suffix, second one's bad char mBadCharSkipA[ mTextA[ i + j ] ] - mPatternLength + j + 1 ); } } return v; // return all matches (empty vector if pattern not found) } }