Format HTML text with RegExp

by ZOODUCK on 29 January 2013

For this small project I decided to have a go at styling plain text in HTML using <span> tags and RegExp.

The String prototype txtFormat(r1,r2,o1,o2) has 2 REQUIRED arguments

and 2 OPTIONAL arguments

Examples of how to call txtFormat:

The above examples would color any occurrences of the word "red" in the String by replacing them with:
<span style='color:#ff0000;'>red</span>

It is not possible to do this kind of text replacement with a simple loop of JavaScript String.replace() calls when the replacement text itself contains the match AND the string includes more than one instance of the match, as this will result in malformed text (see Example 4).

To help explain this better, here are some examples to show when it is OK to use JavaScript's replace() and match() on their own, and when you need to be a little more creative.

Example 1 (SINGLE Regular Expression, Replacement does NOT include the match):

string
hippo tiger bear hippo
regexp (pattern)
/hippo/g
lit. all instances of hippo
result
// The /g flag makes replace() swap every "hippo" in the string.
// The replacement ("potamus") does not contain the matched text.
var str = "hippo tiger bear hippo";
var text_format = str.replace(/hippo/g, "potamus");
// Write the replaced string into the page.
document.write(text_format);

Example 2 (SINGLE Regular Expression, Replacement DOES include the match):

string
hippo tiger bear hippo
regexp (pattern)
/hippo/g
lit. all instances of hippo
result
// Same global replace as before, but here the replacement ("hippopotamus")
// contains the matched text ("hippo").
var str = "hippo tiger bear hippo";
var text_format = str.replace(/hippo/g, "hippopotamus");
// Write the replaced string into the page.
document.write(text_format);

That works OK if we know what the replacement is in advance. But if your Regular Expression has more than a single match condition, then for obvious reasons you will need to run a for loop and replace each match individually, as in Example 3.

Example 3 (MULTIPLE Regular Expression, Replacement includes the match):

string
hippo tiger bear
regexp (pattern)
/(hippo|tiger)/g
lit. all instances of hippo OR tiger
result
// Collect every match of the alternation, then append "potamus" to each one in turn.
var str = "hippo tiger bear";
// With the /g flag, match() returns an array of all matched substrings.
var match_array = str.match(/(hippo|tiger)/g);
for(var i = 0; i < match_array.length; i++){
	var match = match_array[i];
	// replace() with a string pattern only changes the FIRST occurrence of that string
	str = str.replace(match, match + "potamus");
}
document.write(str);

Again, it works without any problems, and at this stage you may be wondering why it was necessary for me to create a custom method in the first place. (I often see examples of highly complex methods on the internet which are both unnecessary and require more code than using native functions alone). However, look what happens when we add more than one hippo to the string, as in Example 4.

Example 4 (MULTIPLE Regular Expression, Replacement includes the match, String has more than one instance of a match condition)

string
hippo tiger bear hippo tiger bear hippo
regexp (pattern)
/(hippo|tiger)/g
lit. all instances of hippo OR tiger
result
// Deliberately broken: same loop as Example 3, but the string now repeats its matches.
var str = "hippo tiger bear hippo tiger bear hippo";
// match_array holds one entry per occurrence, so "hippo" and "tiger" appear several times.
var match_array = str.match(/(hippo|tiger)/g);
for(var i = 0; i < match_array.length; i++){
	var match = match_array[i];
	// A string pattern always hits the first occurrence, which already had "potamus"
	// appended on an earlier pass - so the suffix piles up there and later occurrences stay untouched.
	str = str.replace(match, match + "potamus");
}
document.write(str);

So because the replacement contains the match, each time we call the replace() method on the string it gets applied to the first instance of the match only, adding "potamus" to the first hippo as many times as the match (hippo) occurs in the string.

The way around this problem would be to create a new Regular Expression in the for loop, made up of the match with a "negative lookahead", for handing to the replace() method.

Note: using negative lookahead in the original Regular Expression given to the match() method would be useless, as none of the original matches have "potamus" after them.

So in the case of hippo, we would be changing the Regular Expression from the match
/hippo/
to
/hippo(?!potamus)/
- in other words, "hippo" not followed by "potamus", as illustrated in Example 5 below.

Example 5 (MULTIPLE Regular Expression, Replacement includes the match, String has more than one instance of a match condition)

string
hippo tiger bear hippo tiger bear hippo
regexp (pattern)
/(hippo|tiger)/g
lit. all instances of hippo OR tiger
result
// Fix for Example 4: skip occurrences that have already been given the "potamus" suffix.
var str = "hippo tiger bear hippo tiger bear hippo ";
var match_array = str.match(/(hippo|tiger)/g);
for(var i = 0; i < match_array.length; i++){
	var match = match_array[i];	
	// Build a pattern for this match with a negative lookahead: the match NOT followed by "potamus".
	// The match text is used as regexp source unescaped, so it must not contain metacharacters.
	var regexp = new RegExp(match + "(?!potamus)");		
	// No /g flag: only the first not-yet-suffixed occurrence is replaced on each pass.
	str = str.replace(regexp, match + "potamus");
}
document.write(str);
This works OK, as long as you are not trying to match any of the 11 Regular Expression metacharacters
[ \ ^ $ . | ? * + ( )

If you ARE trying to match metacharacters (or your match just so happens to contain them), then the string passed to the RegExp() constructor in the for loop will cause a fatal error! To prevent this from happening, you will need to check for metacharacters within the match, and escape each one with a backslash (written as "\\", two backslashes, inside a JavaScript string literal) before passing the match to the RegExp() constructor, as illustrated in Example 6.

Example 6 (MULTIPLE Regular Expression, Replacement includes the match, String has more than one instance of a match condition, Match includes Regular Expression metacharacters)

string
*hippo* tiger bear hippo *tiger* bear *hippo*
regexp (pattern)
/\*(hippo|tiger)\*/g
lit. all instances of *hippo* OR *tiger*
result
// Same approach as Example 5, but the matches contain "*", which is a regexp metacharacter,
// so each match is escaped before it is used as regexp source.
var str = "*hippo* tiger bear hippo *tiger* bear *hippo* ";
var match_array = str.match(/\*(hippo|tiger)\*/g);
for(var i = 0; i < match_array.length; i++){
	var match = match_array[i];
	// Character class listing the metacharacters that get escaped: [ \ ^ $ . | ? * + ( )
	var metachars_regexp = /[\[\\\^\$\.\|\?\*\+\(\)]/;
	//rebuild the match a character at a time, escaping metacharacters where necessary
	var regexp_safe_str = "";
	for(var n = 0; n < match.length; n++){
		if(match[n].match(metachars_regexp)){
			// "\\" in a string literal is a single backslash, which escapes the next character in the regexp
			var c = "\\" + match[n];
		}else{
			var c = match[n];
		}
		regexp_safe_str += c;		
	}
	// Escaped match followed by a negative lookahead so already-suffixed occurrences are skipped.
	var regexp = new RegExp(regexp_safe_str + "(?!potamus)");	
	// The replacement uses the original (unescaped) match text.
	str = str.replace(regexp, match + "potamus");
}
// Unlike the earlier examples, the result is written into the element with id "ex6".
document.getElementById('ex6').innerHTML = str;	

The txtFormat() method allows you to colour format text without having to worry about any of the issues I have pointed out in the above examples. It is not a comprehensive solution to text formatting, but if you are careful about the pattern order (always format quotes first and comments last, for example) then it can handle just about any JavaScript or PHP source code without generating any malformed text.

The next two examples show how I used txtFormat to apply styles to plain text.

Basic Example

string
if(XMLHttpRequest? xhr = new XMLHttpRequest : xhr = new ActiveXObject("Microsoft.XMLHTTP"))
result
// Single txtFormat pass: colour the keywords and give them an extra inline style.
var str = 'if(XMLHttpRequest? xhr = new XMLHttpRequest : xhr = new ActiveXObject("Microsoft.XMLHTTP"))';
// format commands (if, new)
// arguments: pattern, colour, additional CSS declarations for the span's style attribute
var result = str.txtFormat(/(if|new)/, '#1975d1', 'font:italic bold 18px helvetica, sans-serif; padding-right:5px;');
document.write(result);

Advanced Example

string
if(XMLHttpRequest? xhr = new XMLHttpRequest : xhr = new ActiveXObject("Microsoft.XMLHTTP"))
result
// Several txtFormat passes chained together; each pass runs on the output of the previous one,
// so later patterns see the <span> markup inserted by earlier passes.
var str = 'if(XMLHttpRequest? xhr = new XMLHttpRequest : xhr = new ActiveXObject("Microsoft.XMLHTTP"))';
// format double quotes
// (colour only - no extra style argument)
var result = str.txtFormat(/"[^"]*["]/, 'palegoldenrod');
// format commands (if, new)
result = result.txtFormat(/(if|new)/, '#1975d1', 'font:italic bold 18px helvetica, sans-serif; padding-right:5px;');
// format equals(=), colon(:) or question mark(?)
// "=" and ":" only match when surrounded by whitespace
result = result.txtFormat(/(\s[=:]\s|\?)/, 'firebrick', 'font:bold 24px tahoma, sans-serif;');
// format braces ()
result = result.txtFormat(/[\(\)]/, 'sandybrown', 'font:bold 18px sans-serif; padding-left:2px; padding-right:2px;');
document.write(result);

Below is the entire source code for the txtFormat prototype.

//===================================================================================================================================
// txtFormat(find<regular expression>, color<hex/rgb string>, style<string>(optional), remove_style<boolean>(optional))
// example: "var x = 1;".txtFormat(/var\s/, "#1975d1", "font:bold 12px sans-serif; text-decoration:underline;")
//
// Returns a copy of the string in which each piece of text matching `pattern` is wrapped in
// <span style='color:...;'>...</span>. The original string is not modified.
//
//   pattern      - regular expression to look for (the g flag is not required)
//   color        - CSS colour value; any semicolons in it are stripped before use
//   style        - optional extra CSS declarations, appended after the colour in the style attribute
//   remove_style - optional; when truthy, style="..." attributes found INSIDE the matched text are removed
//===================================================================================================================================
String.prototype.txtFormat = function(pattern, color, style, remove_style){
	var str = String(this);
	//============================================
	// set color (required) and style (optional)
	//============================================
	// strip every semicolon so the colour value cannot terminate the declaration early
	var color_style = 'color:' + color.replace(/;/g, "") + ';';
	var span;
	if(style != null){
		span = "<span style='" + color_style + " " + style + "'>";
	}else{
		span = "<span style='" + color_style + "'>";
	}
	//=========================================================================================================================
	// read the string a character at a time, building the section var until a match is found, then:
	// 1. format the match
	// 2. update the build string with the section var (optionally remove style attribute from all span tags within the match)
	// 3. empty the section var and start again
	// (the pattern is re-tested against the section after every appended character)
	//==========================================================================================================================
	var section = "";
	var build_str = "";
	for(var i = 0; i < str.length; i++){
		section += str[i];
		var match_query = section.match(pattern);
		if(match_query != null){
			var matched = match_query[0];
			// search() reports where the first match starts, with or without the g flag.
			// Splicing at that index (rather than re-finding the matched text with replace())
			// wraps the occurrence the pattern actually matched, and keeps "$&", "$$" etc.
			// inside the matched text from being treated as replacement patterns.
			var start = section.search(pattern);
			var inner = matched;
			if(remove_style){
				// format text and remove style attribute from all span tags WITHIN this match!
				inner = matched.replace(/\sstyle=["'][^"']+["']/g, '');
			}
			build_str += section.slice(0, start) + span + inner + '</span>' + section.slice(start + matched.length);
			section = ""; // empty
		}
	}
	build_str += section; // any remaining text without matches (last part of the string)
	return build_str;
}

The script below uses txtFormat to colour the above source code - you can run it by clicking on the button

function formatSourceCode(){
	//=======================
	// regular expressions
	// (identical for every line, so they are created once before the loop)
	//=======================

	// quotations (inc escaped quotes)
	var quotes_re = /(("([^"]|\\")*[^\\]"|"{2})|'[^']*')/;
	// vars
	var vars_re = /(var(?=\s)|else(?={)|else(?=\s)|echo(?=\s)|([^\w]if|for|foreach|function|new|include|true|false|null|die|exit|break|switch|case|return)(?=\W))/;
	// operators
	var operators_re = /(\+=|-=|!=|[^\/=]={1,2}\s|(>|<)[\d\s=]|&|\.(?!\w)|[\s\d][\+\*\-](?=[^=\/]))/;
	// braces
	var braces_re = /[\[\]{}]/;
	// patterns
	var patterns_re = /\/{1}(?!span).+(\/g|\/i|\/gi|\/(?=;)|\/(?=\)))(?!>)/;
	// single line comments
	var single_line_comments_re = /\/\/.+\n/;
	// multi-line comments
	var multi_line_comments_re = /\/\*[^\*]+\*\//;

	// Per-line passes in the order they are applied: [pattern, color, style, remove_style].
	// Entries left out are passed to txtFormat as undefined.
	var line_passes = [
		[quotes_re, '#ffcc00', 'font-weight:bold'],
		[vars_re, '#1975d1', 'font:italic normal 18px impact, sans-serif; letter-spacing:1px;'],
		[operators_re, '#cc0000', 'font:normal 18px impact, sans-serif;'],
		[braces_re, 'rgb(255, 124, 147)'],
		[patterns_re, '#994e99', null, true],
		[single_line_comments_re, 'gray', null, true]
	];

	var source_lines = document.getElementById('source_code').innerHTML.split("\n");
	var formatted = "";
	for(var row = 0; row < source_lines.length; row++){
		// put the new line character back (the single line comment pattern ends on it)
		var current = source_lines[row] + "\n";
		for(var p = 0; p < line_passes.length; p++){
			var pass = line_passes[p];
			current = current.txtFormat(pass[0], pass[1], pass[2], pass[3]);
		}
		formatted += current;
	}

	// whole-text passes: single line comments again, then multi line comments
	// (true = remove style attribute from all span tags INSIDE the match)
	formatted = formatted.txtFormat(single_line_comments_re, 'gray', null, true);
	formatted = formatted.txtFormat(multi_line_comments_re, 'gray', null, true);

	// show the formatted text and hide the plain source
	var output = document.getElementById('source_code_text_format');
	output.innerHTML = formatted;
	output.style.display = 'block';
	document.getElementById('source_code').style.display = 'none';
}