(*

This script will convert high bit WinLatin1 (code page 1252) characters (128-255)
to their unicode character entity equivalents. Based on the following document:
http://www.microsoft.com/globaldev/reference/sbcs/1252.mspx

Dropping files on this script edits them in place. Any file that does not appear to be
a text file is skipped. If none of the dropped files look like text files, the user is
alerted and perl is not run.

Just running the script will edit what's on the clipboard.

	--CBT: 2007-05-11
*)


on open these_items
	-- Droplet entry point: convert every dropped text file in place.
	--
	-- these_items: the list of file references handed to the droplet.
	--
	-- Each item is classified with file(1). Only items reported as ASCII text or
	-- ISO-8859 text are passed on to perl; everything else (binaries, folders,
	-- UTF-8 text, ...) is skipped. Note that "Non-ISO extended-ASCII text", which is
	-- what file(1) reports when bytes 0x80-0x9F are present, contains "ASCII text"
	-- and is therefore accepted as well.
	--
	-- Paths are always handed to the shell via "quoted form of" so that spaces,
	-- quotes, dollar signs, backticks and backslashes in file names are taken
	-- literally instead of being interpreted by the shell.
	set paths to "" -- every dropped path, one per line (used for the alert only)
	set args to "" -- shell-quoted paths of the files that will be converted
	
	repeat with i from 1 to the count of these_items
		set this_item to item i of these_items
		set posix_path to POSIX path of this_item
		set paths to paths & posix_path & (ASCII character 10)
		
		-- Ask file(1) what this item is. A failure to classify means "skip it".
		try
			set file_kind to do shell script "file -b " & (quoted form of posix_path)
		on error
			set file_kind to ""
		end try
		
		if (file_kind contains "ASCII text") or (file_kind contains "ISO-8859 text") then
			set args to args & (quoted form of posix_path) & " "
		end if
	end repeat
	
	if (args = "") then
		-- If args is empty then we dropped only non-text file(s).
		-- Passing no args may cause perl to hang while reading from stdin.
		display dialog "Non-text file(s) encountered:
" & paths buttons {"Ok"} default button 1
	else
		-- Else must have some valid text files to process.
		do shell script perl_script(args)
	end if
	
end open


on run
	-- Launched directly (nothing dropped): filter the clipboard text through the
	-- perl converter and put the converted text back on the clipboard.
	set clipboard_filter to perl_script("")
	do shell script "
		# Feed perl the contents of our clipboard and copy the results back:
		pbpaste | " & clipboard_filter & " | pbcopy
	"
end run


on perl_script(args)
	-- Build the shell command line that runs the perl converter.
	--
	-- args: a string of already shell-quoted file paths separated by spaces,
	--       or "" to get a command that filters stdin to stdout.
	-- Returns: the complete command text, suitable for "do shell script" or for
	--          use as one stage of a pipeline.
	--
	-- The perl program replaces every byte 0x80-0xFF with a decimal numeric
	-- character reference ("&#NNNN;"). Bytes 0x80-0x9F are looked up in a
	-- code page 1252 table (unassigned positions are deleted); bytes 0xA0-0xFF
	-- use their own value as the code point.
	--
	-- In-place editing (-i) only makes sense when files are named. With no file
	-- arguments perl acts as a plain stdin-to-stdout filter, and newer perls
	-- warn when -i is given without file names, so -i is left out in that case.
	if args is "" then
		set switches to "-pw"
	else
		set switches to "-pwi"
	end if
	
	return "perl " & switches & " -e '

	# Define an array for double byte unicode characters.
	# Undefined characters are marked as 0.
	BEGIN {
		@uni = (
			8364, 0, 8218, 402, 8222, 8230, 8224, 8225,
			710, 8240, 352, 8249, 338, 0, 381, 0, 0, 
			8216, 8217, 8220, 8221, 8226, 8211, 8212, 
			732, 8482, 353, 8250, 339, 0, 382, 376
		);
	}

	# Windows code page 1252 characters 128 through 159 are mixed set of double byte unicode
	# characters, so get these out of our $uni array. Undefined characters in this range are deleted.
	s/([\\x80-\\x9f])/ $uni[ord($1)-128] ? sprintf(\"&#%d;\", $uni[ord($1)-128]) : \"\"/eg;

	# Characters 160 through 255 can be used as is.
	s/([\\xa0-\\xff])/sprintf(\"&#%d;\", ord($1))/eg

	' " & args
end perl_script

-- Open win2unicode in the AppleScript editor.