Strip special Microsoft Word chars from copied input





12
Date Submitted Thu. Mar. 9th, 2006 4:02 AM
Revision 1 of 1
Scripter TimYates
Tags Java | String | Word
Comments 1 comments
If you have ever written a website or Java application where people paste text in from Word, you have probably hit the problem of Word using non-ASCII characters for opening quotes, closing quotes, the registered-trademark symbol, and so on.

This Java class (with a single static method) replaces the most prevalent of these characters with plain ASCII equivalents.

Hope it helps someone...

/**
 * <p>Title: Word Cleaner</p>
 * <p>Description: Replaces the typographic characters that Microsoft Word tends to
 * generate (curly quotes, dashes, ellipses, bullets, etc.) with plain ASCII text.</p>
 *
 * <p>Two families of input characters are recognised:</p>
 * <ul>
 *   <li>code units 0x82-0xBE, i.e. Windows-1252 bytes that were decoded one-to-one
 *       into chars (plus a few Latin-1 symbols such as the copyright sign);</li>
 *   <li>the corresponding Unicode code points in the U+2010-U+2027 range, and three
 *       private-use code points (U+F04A-U+F04C) that are replaced with text smileys.</li>
 * </ul>
 * <p>Every other character is copied through unchanged.</p>
 *
 * @author Tim Yates
 * @version 1.0
 *
 * Based on John Walker's "Demoroniser" Perl script : http://www.fourmilab.ch/webtools/demoroniser/
 */
public final class WordCleaner
{
  /** Utility class: not instantiable. */
  private WordCleaner() {}

  /**
   * Returns a copy of {@code input} in which every recognised Word-style character
   * has been replaced by its ASCII substitute.
   *
   * @param input the text to clean; may be {@code null}
   * @return the cleaned text, or {@code null} if {@code input} is {@code null}
   */
  public static String runWordCleaner( String input )
  {
    // Pass null straight through instead of failing with a NullPointerException.
    if( input == null )
    {
      return null ;
    }

    // The buffer never leaves this method, so the unsynchronised builder is enough.
    StringBuilder cleaned = new StringBuilder( input.length() ) ;

    for( int i = 0 ; i < input.length() ; i++ )
    {
      char c = input.charAt( i ) ;
      String replacement = replacementFor( c ) ;
      if( replacement == null )
      {
        cleaned.append( c ) ;
      }
      else
      {
        cleaned.append( replacement ) ;
      }
    }
    return cleaned.toString() ;
  }

  /** Returns the ASCII substitute for {@code c}, or {@code null} if {@code c} is kept as is. */
  private static String replacementFor( char c )
  {
    switch( c )
    {
      // Windows-1252 byte values that were decoded one-to-one into chars.
      case 0x82 : return "," ;      // single low-9 quote
      case 0x83 : return "f" ;      // f with hook
      case 0x84 : return ",," ;     // double low-9 quote
      case 0x85 : return "..." ;    // ellipsis
      case 0x88 : return "^" ;      // circumflex
      case 0x89 : return "ppt" ;    // per mille
      case 0x8B : return "<" ;      // single left angle quote
      case 0x8C : return "Oe" ;     // OE ligature
      case 0x91 : return "'" ;      // left single quote
      case 0x92 : return "'" ;      // right single quote
      case 0x93 : return "\"" ;     // left double quote
      case 0x94 : return "\"" ;     // right double quote
      case 0x95 : return "*" ;      // bullet
      case 0x96 : return "-" ;      // en dash
      case 0x97 : return "--" ;     // em dash
      case 0x98 : return "~" ;      // small tilde
      case 0x99 : return "TM" ;     // trade mark
      case 0x9B : return ">" ;      // single right angle quote
      case 0x9C : return "oe" ;     // oe ligature

      // Latin-1 symbols.
      case 0xA9 : return "(c)" ;    // copyright
      case 0xAE : return "(r)" ;    // registered
      case 0xBC : return "1/4" ;
      case 0xBD : return "1/2" ;
      case 0xBE : return "3/4" ;

      // Unicode General Punctuation.
      case 0x2010 : return "-" ;    // hyphen
      case 0x2011 : return "-" ;    // non-breaking hyphen
      case 0x2013 : return "-" ;    // en dash: single hyphen, consistent with 0x96 above
      case 0x2014 : return "--" ;   // em dash
      case 0x2015 : return "--" ;   // horizontal bar
      case 0x2016 : return "||" ;   // double vertical line
      case 0x2017 : return "_" ;    // double low line
      case 0x2018 : return "'" ;    // left single quote
      case 0x2019 : return "'" ;    // right single quote
      case 0x201A : return "," ;    // single low-9 quote
      case 0x201B : return "'" ;    // single high-reversed-9 quote
      case 0x201C : return "\"" ;   // left double quote
      case 0x201D : return "\"" ;   // right double quote
      case 0x201E : return ",," ;   // double low-9 quote
      case 0x201F : return "\"" ;   // double high-reversed-9 quote
      case 0x2022 : return "*" ;    // bullet
      case 0x2023 : return ">" ;    // triangular bullet
      case 0x2024 : return "." ;    // one dot leader: a dot, in line with the two entries below
      case 0x2025 : return ".." ;   // two dot leader
      case 0x2026 : return "..." ;  // horizontal ellipsis
      case 0x2027 : return "-" ;    // hyphenation point

      // Private-use code points replaced with text smileys.
      case 0xF04A : return ":-)" ;
      case 0xF04B : return ":-|" ;
      case 0xF04C : return ":-(" ;

      default : return null ;
    }
  }
}
 

Tim Yates

Comments

Comments Great Snippet, with mods
Thu. Nov. 30th, 2006 1:49 PM    Newbie CookConsulting_com

Voting