正好有时间，所以用 C# 写了一段正则表达式代码，作用是删除页面（Page）代码（Code）中的 HTML 标签。这在做信息采集、需要去除其中的 HTML 时很有用处。
以下是引用片段：
/// <summary>
/// Strips HTML markup from a page fragment and returns the remaining text.
/// Script, iframe and frameset blocks are removed together with their
/// contents; every other tag is removed but its inner text is kept.
/// </summary>
/// <param name="html">HTML source to clean. May be null or empty.</param>
/// <returns>
/// The input with markup removed. Null or empty input is returned unchanged.
/// </returns>
public string checkStr(string html)
{
    if (string.IsNullOrEmpty(html))
    {
        return html;
    }

    System.Text.RegularExpressions.RegexOptions ignoreCase =
        System.Text.RegularExpressions.RegexOptions.IgnoreCase;

    // Remove <script>...</script> blocks including their bodies. The quantifier
    // is lazy so that text sitting between two separate blocks is preserved.
    html = System.Text.RegularExpressions.Regex.Replace(
        html, @"<script[\s\S]+?</script *>", "", ignoreCase);

    // Remove href=...script: prefixes (javascript:, vbscript:) from anchors.
    // [^>]*? keeps the match inside a single tag.
    html = System.Text.RegularExpressions.Regex.Replace(
        html, @" href *= *[^>]*?script *:", "", ignoreCase);

    // Neutralise inline on* event attributes (onclick=, onload=, ...).
    // \w+ requires an attribute name right after "on", so plain text such as
    // "turn on x = 1" is left alone.
    html = System.Text.RegularExpressions.Regex.Replace(
        html, @" on\w+ *=", " _disibledevent=", ignoreCase);

    // Remove <iframe>...</iframe> blocks including their bodies.
    html = System.Text.RegularExpressions.Regex.Replace(
        html, @"<iframe[\s\S]+?</iframe *>", "", ignoreCase);

    // Remove <frameset>...</frameset> blocks including their bodies.
    html = System.Text.RegularExpressions.Regex.Replace(
        html, @"<frameset[\s\S]+?</frameset *>", "", ignoreCase);

    // Remove <img ...> tags.
    html = System.Text.RegularExpressions.Regex.Replace(
        html, @"<img[^>]+>", "", ignoreCase);

    // Remove paragraph close/open tags.
    html = System.Text.RegularExpressions.Regex.Replace(html, @"</p>", "", ignoreCase);
    html = System.Text.RegularExpressions.Regex.Replace(html, @"<p>", "", ignoreCase);

    // Remove every remaining tag, keeping the text between tags.
    html = System.Text.RegularExpressions.Regex.Replace(html, @"<[^>]*>", "", ignoreCase);

    // NOTE(review): the first argument of this replace was lost in the published
    // snippet; it is assumed to have been the "&nbsp;" entity - confirm.
    html = html.Replace("&nbsp;", "");

    // Kept from the original; normally a no-op because the generic tag strip
    // above has already removed <strong> tags.
    html = html.Replace("</strong>", "");
    html = html.Replace("<strong>", "");

    return html;
}