波斯码BOSSMA Information Technology

C#使用正则表达式过滤script脚本程序

发布时间:2011年5月10日 / 分类:ALL, DOTNET / 20,026 次浏览 / 评论

很多时候都需要过滤字符串中的javascript等脚本程序,比如防止跨站攻击,采集信息等。网上找了很多,大多不能用,或者存在很多的漏洞和问题,并且以讹传讹的垃圾站也特别多,搜出来的就是那几个,管用的好用的寥寥无几。好歹找到几个能改造下的,看了看,都是使用正则表达式来匹配替换的。正则表达式我也看过很多遍了,但是总是记不住,没办法又查了查资料,边学习边改造。然后将改造的结果公布出来,方便大家批一批!

先来看看效果,是不是你想要的,也证明不是吹的。

先来熟悉几个正则表达式的语法:

\s  空白字符,包括换行符\n、回车符\r、制表符\t、垂直制表符\v、换页符\f
\S  \s的补集
\w  单词字符,指大小写字母、0-9的数字、下划线
\W  \w的补集

更多正则信息,可以参考:http://www.cnblogs.com/KissKnife/archive/2008/03/23/1118423.html

看看如何过滤javascript引用或区块:

有时候javascript会写到Dom元素的鼠标事件中或者链接中,这时候过滤起来比较麻烦,在测试的过程中我写了三个方法:

方法一:整体去除,不能去除不被单引号或双引号包含的属性值

这个方法匹配以on开头的属性,比如 onclick="alert('123')",过滤的时候会整体滤除这些字符。

方法二:去除属性值


这个方法首先匹配标签,获取元素的全部属性,然后再分析元素的属性,过滤掉以on开头的属性的值。

比如:<div id="id1" onclick="alert('123')">这里是内容</div>

过滤的时候会滤除onclick的值,即alert('123')。

方法三:整体滤除,效果较好

这个方法是方法一的变形,定义一个“组”:ScriptBlock,然后获取到匹配的字符串,然后逐个替换。

通过上边两种方法,基本上可以过滤掉全部的javascript了,还有没有漏网之鱼呢?

诶~,还有一个href中的javascript:

这样就完整了吧。如果还有漏网的,欢迎给我提出来啊。

最后还有几个滤除的方法,和上边的一并贴出来,比如过滤frame、object、html,以及自定义的字符等。

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;

namespace VeryCodes.Text
{
??? public class StringFilter
??? {
??????? /// <summary>
??????? /// 滤除script引用和区块
??????? /// </summary>
??????? /// <param name="str"></param>
??????? /// <returns></returns>
??????? public static string FilterScript(string str)
??????? {
??????????? string pattern = @"<script[\s\S]+</script *>";
??????????? return StripScriptAttributesFromTags(Regex.Replace(str, pattern, string.Empty, RegexOptions.IgnoreCase));
??????? }

??????? /// <summary>
??????? /// 去除标签中的script属性
??????? /// </summary>
??????? /// <param name="str"></param>
??????? /// <returns></returns>
??????? private static string StripScriptAttributesFromTags(string str)
??????? {
??????????? //\s 空白字符,包括换行符\n、回车符\r、制表符\t、垂直制表符\v、换页符\f
??????????? //\S \s的补集
??????????? //\w 单词字符,指大小写字母、0-9的数字、下划线
??????????? //\W \w的补集

??????????? //方法一:整体去除,不能去除不被单引号或双引号包含的属性值
??????????? //string pattern = @"on\w+=\s*(['""\s]?)([/s/S]*[^\1]*?)\1[\s]*";
??????????? //content = Regex.Replace(str, pattern, string.Empty, RegexOptions.Compiled | RegexOptions.IgnoreCase);

??????????? ////方法二:去除属性值
??????????? //string pattern = @"<\w+\s+(?<Attrs>[^>]*?)[>|/>]";
??????????? //Regex r = new Regex(pattern, RegexOptions.Compiled | RegexOptions.IgnoreCase);
??????????? //foreach (Match m in r.Matches(content))
??????????? //{
??????????? //??? //获取标签的全部属性
??????????? //??? string attrs = m.Groups["Attrs"].Value;

??????????? //??? if (!string.IsNullOrEmpty(attrs))
??????????? //??? {
??????????? //??????? //获取每一个属性
??????????? //??????? Regex rt = new Regex(@"(?<AttrName>\w+)\s*=(?<AttrPre>[\s]*(['""\s]?))(?<AttrVal>[^\1]*?)\1", RegexOptions.Compiled | RegexOptions.IgnoreCase);
??????????? //??????? foreach (Match mt in rt.Matches(attrs))
??????????? //??????? {
??????????? //??????????? string attrName = mt.Groups["AttrName"].Value.Trim().ToLower();
??????????? //??????????? string attrVal = mt.Groups["AttrVal"].Value.Trim().ToLower();

??????????? //??????????? //匹配以on开头的属性
??????????? //??????????? if (attrName.StartsWith("on") && !string.IsNullOrEmpty(attrVal))
??????????? //??????????? {
??????????? //??????????????? //将属性值替换为空
??????????? //??????????????? str = str.Replace(attrVal, string.Empty);
??????????? //??????????? }
??????????? //??????? }
??????????? //??? }
??????????? //}

??????????? //整体去除
??????????? string pattern = @"(?<ScriptAttr>on\w+=\s*(['""\s]?)([/s/S]*[^\1]*?)\1)[\s|>|/>]";
??????????? Regex r = new Regex(pattern, RegexOptions.Compiled | RegexOptions.IgnoreCase);
??????????? foreach (Match m in r.Matches(str))
??????????? {
??????????????? string attrs = m.Groups["ScriptAttr"].Value;
??????????????? if (!string.IsNullOrEmpty(attrs))
??????????????? {
??????????????????? str = str.Replace(attrs, string.Empty);
??????????????? }
??????????? }

??????????? //滤除包含script的href
??????????? str = FilterHrefScript(str);

??????????? return str;
??????? }

??????? /// <summary>
??????? /// 滤除包含script的href
??????? /// </summary>
??????? /// <param name="str"></param>
??????? /// <returns></returns>
??????? public static string FilterHrefScript(string str)
??????? {
??????????? //整体去除,不能去除不被单引号或双引号包含的属性值
??????????? string regexstr = @" href[ ^=]*=\s*(['""\s]?)[\w]*script+?:([/s/S]*[^\1]*?)\1[\s]*";
??????????? return Regex.Replace(str, regexstr, " ", RegexOptions.IgnoreCase);
??????? }

??????? /// <summary>
??????? /// 滤除src
??????? /// </summary>
??????? /// <param name="str"></param>
??????? /// <returns></returns>
??????? public static string FilterSrc(string str)
??????? {
??????????? //整体去除
??????????? string regexstr = @" src *=\s*(['""\s]?)[^\.]+\.(\w+)\1[\s]*";
??????????? return Regex.Replace(str, regexstr, " ", RegexOptions.IgnoreCase);
??????? }

??????? /// <summary>
??????? /// 滤除Html
??????? /// </summary>
??????? /// <param name="content"></param>
??????? /// <returns></returns>
??????? public static string FilterHtml(string str)
??????? {
??????????? string[] aryReg ={
????????????? @"<style[\s\S]+</style>",
????????????? @"<.*?>",
????????????? @"<(.[^>]*)>",
????????????? @"([\r\n])[\s]+",
????????????? @"&(quot|#34);",
????????????? @"&(amp|#38);",
????????????? @"&(lt|#60);",
????????????? @"&(gt|#62);",
????????????? @"&(nbsp|#160);",
????????????? @"&(iexcl|#161);",
????????????? @"&(cent|#162);",
????????????? @"&(pound|#163);",
????????????? @"&(copy|#169);",
????????????? @"&#(\d+);",
????????????? @"-->",
????????????? @"<!--.*\n"
??????????? };

??????????? string[] aryRep = {
?????????? "",
?????????? "",
?????????? "",
?????????? "",
?????????? "\"",
?????????? "&",
?????????? "<",
?????????? ">",
?????????? " ",
?????????? "\xa1",//chr(161),
?????????? "\xa2",//chr(162),
?????????? "\xa3",//chr(163),
?????????? "\xa9",//chr(169),
?????????? "",
?????????? "\r\n",
?????????? ""
????????? };

??????????? string strOutput = str;
??????????? for (int i = 0; i < aryReg.Length; i++)
??????????? {
??????????????? Regex regex = new Regex(aryReg[i], RegexOptions.IgnoreCase);
??????????????? strOutput = regex.Replace(strOutput, aryRep[i]);
??????????? }

??????????? strOutput = strOutput.Replace("<", "");
??????????? strOutput = strOutput.Replace(">", "");
??????????? strOutput = strOutput.Replace("\r\n", "");

??????????? return strOutput;
??????? }

??????? /// <summary>
??????? /// 过滤object
??????? /// </summary>
??????? /// <param name="content"></param>
??????? /// <returns></returns>
??????? public static string FilterObject(string content)
??????? {
??????????? string regexstr = @"<object[\s\S]+</object *>";
??????????? return Regex.Replace(content, regexstr, string.Empty, RegexOptions.IgnoreCase);
??????? }

??????? /// <summary>
??????? /// 过滤Iframe
??????? /// </summary>
??????? /// <param name="content"></param>
??????? /// <returns></returns>
??????? public static string FilterIframe(string content)
??????? {
??????????? string regexstr = @"<iframe[\s\S]+</iframe *>";
??????????? return Regex.Replace(content, regexstr, string.Empty, RegexOptions.IgnoreCase);
??????? }

??????? /// <summary>
??????? /// 过滤Frameset
??????? /// </summary>
??????? /// <param name="content"></param>
??????? /// <returns></returns>
??????? public static string FilterFrameset(string content)
??????? {
??????????? string regexstr = @"<frameset[\s\S]+</frameset *>";
??????????? return Regex.Replace(content, regexstr, string.Empty, RegexOptions.IgnoreCase);
??????? }

??????? /// <summary>
??????? /// 过滤SQL注入
??????? /// </summary>
??????? /// <returns></returns>
??????? public static string FilterSql(string str)
??????? {
??????????? str = str.Replace("'", "''");
??????????? str = str.Replace("<", "&lt;");
??????????? str = str.Replace(">", "&gt;");

??????????? return str;
??????? }

??????? /// <summary>
??????? /// 移除非法或不友好字符
??????? /// </summary>
??????? /// <param name="keyWord">非法或不友好字符</param>
??????? /// <param name="chkStr">要处理的字符串</param>
??????? /// <returns>处理后的字符串</returns>
??????? public static string FilterBadWords(string keyWord, string chkStr)
??????? {
??????????? if (chkStr == "")
??????????? {
??????????????? return "";
??????????? }
??????????? string[] bwords = keyWord.Split('|');
??????????? int i, j;
??????????? string str;
??????????? StringBuilder sb = new StringBuilder();
??????????? for (i = 0; i < bwords.Length; i++)
??????????? {
??????????????? str = bwords[i].ToString().Trim();
??????????????? string regStr, toStr;
??????????????? regStr = str;
??????????????? Regex r = new Regex(regStr, RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Multiline);
??????????????? Match m = r.Match(chkStr);
??????????????? if (m.Success)
??????????????? {
??????????????????? j = m.Value.Length;
??????????????????? sb.Insert(0, "*", j);
??????????????????? toStr = sb.ToString();
??????????????????? chkStr = Regex.Replace(chkStr, regStr, toStr, RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Multiline);
??????????????? }
??????????????? sb.Remove(0, sb.Length);
??????????? }
??????????? return chkStr;
??????? }
??? }
}
本博客所有文章如无特别注明均为原创。
复制或转载请以超链接形式注明转自波斯码,原文地址《C#使用正则表达式过滤script脚本程序》

关键字:

建议订阅本站,及时阅读最新文章!
【上一篇】 【下一篇】