Strip special Microsoft Word chars from copied input
12
If you have ever written a website or Java application where people paste text in from Word, you have probably hit the problem of Word using non-ASCII characters (often loosely called "high ASCII") for "open quotes", "close quotes", the registered-trademark symbol, etc.
This Java class (with a single static method) replaces the most prevalent of these with plain ASCII equivalents.
Hope it helps someone...
This java class (with a single static method) can replace the most prevalent of these with normal ascii values
Hope it helps someone...
/**
 * <p>Title: Word Cleaner</p>
 * <p>Description: Strips out all of the rubbish that Word tends to generate (open, close quotes, etc)</p>
 *
 * <p>Each character of the input is looked up in a fixed table; characters that
 * have an entry are replaced by a short plain-ASCII string, every other character
 * is copied through unchanged. The table covers three ranges:</p>
 * <ul>
 *   <li>0x82-0xBE: presumably Windows-1252 punctuation that arrived as raw
 *       byte values, plus the copyright / registered signs and the vulgar
 *       fractions;</li>
 *   <li>8208-8231 (U+2010-U+2027): Unicode dashes, quotes, bullets and ellipses;</li>
 *   <li>61514-61516 (U+F04A-U+F04C): private-use code points, mapped here to
 *       ASCII emoticons.</li>
 * </ul>
 *
 * @author Tim Yates
 * @version 1.0
 *
 * Based on John Walker's "Demoroniser" Perl script : http://www.fourmilab.ch/webtools/demoroniser/
 */
public class WordCleaner
{
    /** Static utility class; not meant to be instantiated. */
    private WordCleaner() {}

    /**
     * Replaces the Word-specific characters in {@code input} with ASCII equivalents.
     *
     * @param input the text to clean; may be {@code null}
     * @return the cleaned text, or {@code null} when {@code input} is {@code null}
     */
    public static String runWordCleaner( String input )
    {
        // A null input used to fail with a NullPointerException; pass it through instead.
        if( input == null )
        {
            return null ;
        }

        // Most replacements are 1-3 characters, so the input length is a good
        // initial capacity and avoids repeated buffer growth.
        StringBuilder sb = new StringBuilder( input.length() ) ;
        for( int i = 0 ; i < input.length() ; i++ )
        {
            char c = input.charAt( i ) ;
            String replacement = replacementFor( c ) ;
            if( replacement != null )
            {
                sb.append( replacement ) ;
            }
            else
            {
                sb.append( c ) ;
            }
        }
        return sb.toString() ;
    }

    /**
     * Looks up the ASCII replacement for a single character.
     *
     * @param c the character to look up
     * @return the replacement text, or {@code null} if {@code c} should be kept as is
     */
    private static String replacementFor( char c )
    {
        switch( c )
        {
            case 0x82 : return "," ;
            case 0x83 : return "f" ;
            case 0x84 : return ",," ;
            case 0x85 : return "..." ;
            case 0x88 : return "^" ;
            case 0x89 : return "ppt" ;
            case 0x8B : return "<" ;
            case 0x8C : return "Oe" ;
            case 0x91 : return "'" ;
            case 0x92 : return "'" ;
            case 0x93 : return "\"" ;
            case 0x94 : return "\"" ;
            case 0x95 : return "*" ;
            case 0x96 : return "-" ;
            case 0x97 : return "--" ;
            case 0x98 : return "~" ;
            case 0x99 : return "TM" ;
            case 0x9B : return ">" ;
            case 0x9C : return "oe" ;
            case 0xA9 : return "(c)" ;
            case 0xAE : return "(r)" ;
            case 0xBC : return "1/4" ;
            case 0xBD : return "1/2" ;
            case 0xBE : return "3/4" ;
            case 8208 : return "-" ;
            case 8209 : return "-" ;
            case 8211 : return "--" ;
            case 8212 : return "--" ;
            case 8213 : return "--" ;
            case 8214 : return "||" ;
            case 8215 : return "_" ;
            case 8216 : return "'" ;
            case 8217 : return "'" ;
            case 8218 : return "," ;
            case 8219 : return "'" ;
            case 8220 : return "\"" ;
            case 8221 : return "\"" ;
            case 8222 : return ",," ;
            case 8223 : return "\"" ;
            case 8226 : return "*" ;
            case 8227 : return ">" ;
            case 8228 : return "*" ;
            case 8229 : return ".." ;
            case 8230 : return "..." ;
            case 8231 : return "-" ;
            case 61514 : return ":-)" ;
            case 61515 : return ":-|" ;
            case 61516 : return ":-(" ;
            default : return null ;
        }
    }
}
Comments
Voting
Votes Up
abigor42
ASmith
CookConsulting_com
ctiggerf
dannyboy
Grenville
i_kenneth
napyfab
Pio
RatNuShock
rugi
sundaramkumar






Some minor improvements:
The arguments should be checked; if a null is passed in, it should return null, instead of blowing up with a NullPointerException.
The StringBuffer should be created at the size of the incoming string; this will avoid the costly effort of continually resizing the StringBuffer in memory.
Lastly, those wishing to sleep at night may wrap the whole switch in exception handling, ensuring the function can do no harm.
e.g.:
/**
 * Title: Word Cleaner
 * Description: Strips out all of the rubbish that Word tends to generate (open, close quotes, etc)
 *
 * Walks the input one character at a time, substituting a short ASCII string
 * for each character listed in the switch below and copying every other
 * character through unchanged.
 *
 * @author Tim Yates
 * @version 1.1
 * Revised by Todd Cook
 *
 * Based on John Walker's "Demoroniser" Perl script : http://www.fourmilab.ch/webtools/demoroniser/
 */
public class WordCleaner
{
    /** Static utility class; no instances. */
    private WordCleaner() {}

    /**
     * Replaces Word-specific characters in {@code input} with ASCII equivalents.
     *
     * @param input the text to clean; may be {@code null}
     * @return the cleaned text, or {@code null} when {@code input} is {@code null}
     */
    public static String runWordCleaner( String input )
    {
        if( input == null )
        {
            return null ;
        }

        // Presized to the input length so the buffer rarely has to grow.
        StringBuffer cleaned = new StringBuffer( input.length() ) ;
        try
        {
            for( int pos = 0 ; pos < input.length() ; pos++ )
            {
                final char ch = input.charAt( pos ) ;
                // Case labels that share a replacement are grouped together.
                switch( ch )
                {
                    case 0x82 : case 8218 :
                        cleaned.append( "," ) ;
                        break ;
                    case 0x83 :
                        cleaned.append( "f" ) ;
                        break ;
                    case 0x84 : case 8222 :
                        cleaned.append( ",," ) ;
                        break ;
                    case 0x85 : case 8230 :
                        cleaned.append( "..." ) ;
                        break ;
                    case 0x88 :
                        cleaned.append( "^" ) ;
                        break ;
                    case 0x89 :
                        cleaned.append( "ppt" ) ;
                        break ;
                    case 0x8B :
                        cleaned.append( "<" ) ;
                        break ;
                    case 0x8C :
                        cleaned.append( "Oe" ) ;
                        break ;
                    case 0x91 : case 0x92 : case 8216 : case 8217 : case 8219 :
                        cleaned.append( "'" ) ;
                        break ;
                    case 0x93 : case 0x94 : case 8220 : case 8221 : case 8223 :
                        cleaned.append( "\"" ) ;
                        break ;
                    case 0x95 : case 8226 : case 8228 :
                        cleaned.append( "*" ) ;
                        break ;
                    case 0x96 : case 8208 : case 8209 : case 8231 :
                        cleaned.append( "-" ) ;
                        break ;
                    case 0x97 : case 8211 : case 8212 : case 8213 :
                        cleaned.append( "--" ) ;
                        break ;
                    case 0x98 :
                        cleaned.append( "~" ) ;
                        break ;
                    case 0x99 :
                        cleaned.append( "TM" ) ;
                        break ;
                    case 0x9B : case 8227 :
                        cleaned.append( ">" ) ;
                        break ;
                    case 0x9C :
                        cleaned.append( "oe" ) ;
                        break ;
                    case 0xA9 :
                        cleaned.append( "(c)" ) ;
                        break ;
                    case 0xAE :
                        cleaned.append( "(r)" ) ;
                        break ;
                    case 0xBC :
                        cleaned.append( "1/4" ) ;
                        break ;
                    case 0xBD :
                        cleaned.append( "1/2" ) ;
                        break ;
                    case 0xBE :
                        cleaned.append( "3/4" ) ;
                        break ;
                    case 8214 :
                        cleaned.append( "||" ) ;
                        break ;
                    case 8215 :
                        cleaned.append( "_" ) ;
                        break ;
                    case 8229 :
                        cleaned.append( ".." ) ;
                        break ;
                    case 61514 :
                        cleaned.append( ":-)" ) ;
                        break ;
                    case 61515 :
                        cleaned.append( ":-|" ) ;
                        break ;
                    case 61516 :
                        cleaned.append( ":-(" ) ;
                        break ;
                    default :
                        cleaned.append( ch ) ;
                        break ;
                }
            }
        }
        catch( Exception e )
        {
            // Defensive wrapper: report the problem and fall through to return
            // whatever was cleaned up to that point.
            e.printStackTrace() ;
        }
        return cleaned.toString() ;
    }
}