<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"
	xmlns:content="http://purl.org/rss/1.0/modules/content/"
	xmlns:wfw="http://wellformedweb.org/CommentAPI/"
	xmlns:dc="http://purl.org/dc/elements/1.1/"
	xmlns:atom="http://www.w3.org/2005/Atom"
	xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
	xmlns:slash="http://purl.org/rss/1.0/modules/slash/"
	>

<channel>
	<title>柳城 &#187; 数据分析</title>
	<atom:link href="http://liucheng.name/category/biocompute/data-analysis/feed/" rel="self" type="application/rss+xml" />
	<link>http://liucheng.name</link>
	<description>_SEO&#124;摄影&#124;WordPress&#124;博客</description>
	<lastBuildDate>Tue, 31 Jan 2012 01:56:50 +0000</lastBuildDate>
	<language>zh-CN</language>
	<sy:updatePeriod>hourly</sy:updatePeriod>
	<sy:updateFrequency>1</sy:updateFrequency>
	<generator>http://wordpress.org/?v=3.3.1</generator>
		<item>
		<title>用perl统计一个六种类型的数据</title>
		<link>http://liucheng.name/1286/</link>
		<comments>http://liucheng.name/1286/#comments</comments>
		<pubDate>Sun, 09 May 2010 04:30:43 +0000</pubDate>
		<dc:creator>柳城</dc:creator>
				<category><![CDATA[数据分析]]></category>
		<category><![CDATA[perl]]></category>
		<category><![CDATA[统计]]></category>

		<guid isPermaLink="false">http://liucheng.name/1286/</guid>
		<description><![CDATA[ rebecca 童鞋还是第一次问我数据处理的问题阿。其实我对这种问题是比较感兴趣的。同一个基因可能有几类转录因子的结合位点，因此，统计不同的情况。幸好最后还是想出了这样一个用hash来完成的办法。<table class="wumii-related-items" cellspacing="0" cellpadding="3" border="0"  style="clear: both;">
    
    <tr>
        <td colspan="5"><b><font size="-1"  style="display: block !important; padding: 20px 0 5px !important;">无觅猜您也喜欢：</font></b></td>
    </tr>
    
        <tr>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important;">
                    <a target="_blank" title="收到《Perl语言编程（第三版）》" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F859%2F&from=http%3A%2F%2Fliucheng.name%2F1286%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/site_images/2010/12/10/1215716.jpg" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">收到《Perl语言编程（第三版）》</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="Entrez所有数据库的最新数据统计" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F429%2F&from=http%3A%2F%2Fliucheng.name%2F1286%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/site_images/2010/12/10/1214796.png" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">Entrez所有数据库的最新数据统计</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="Perl grep函数" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F1251%2F&from=http%3A%2F%2Fliucheng.name%2F1286%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/images/blogWidget/wordpress_default.gif" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">Perl grep函数</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="用Perl抓取网页和提交表格" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F1236%2F&from=http%3A%2F%2Fliucheng.name%2F1286%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/images/blogWidget/wordpress_default.gif" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">用Perl抓取网页和提交表格</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="Perl chdir函数" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F1101%2F&from=http%3A%2F%2Fliucheng.name%2F1286%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/images/blogWidget/wordpress_default.gif" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">Perl chdir函数</font>
                    </a>
                </td>
        </tr>
    
    <tr>
        <td colspan="5" align="right">
            <a style="text-decoration: none !important;" href="http://www.wumii.com/widget/relatedItems.htm" target="_blank" title="无觅相关文章插件">
                <font size="-1" color="#bbbbbb" style="display: block !important; font-family: arial !important; padding: 5px 0 !important; font-size: 12px !important; color: #bbb !important;">无觅</font>
            </a>
        </td>
    </tr>
</table>]]></description>
			<content:encoded><![CDATA[<p> <a title="由 rebeccajiejie 发表" href="http://yunbio.com/author/rebeccajiejie/">rebecca</a> 童鞋还是第一次问我数据处理的问题啊。其实我对这种问题是比较感兴趣的。</p>
<p>数据是这样子的。<span id="more-1286"></span></p>
<h3>例子文件，只取了一小部分数据：get2_nr.out</h3>
<pre>ZM_BFa0062B01	bzip
ZM_BFa0063A16	bzip
ZM_BFa0063M13	myc
ZM_BFa0063N22	myc
ZM_BFa0063O17	bzip
ZM_BFa0066C13	myb
ZM_BFa0066F21	bzip
ZM_BFa0068F07	bzip
ZM_BFa0068F15	myb
ZM_BFa0068N18	myb
ZM_BFa0068P14	bzip
ZM_BFa0069B08	myb
ZM_BFa0070G13	myb
ZM_BFa0071G19	bzip
ZM_BFa0071G21	myb
ZM_BFa0072O19	bzip
ZM_BFa0073F20	myb
ZM_BFa0073F20	myc
ZM_BFa0073O09	myb
ZM_BFa0073O09	myc
ZM_BFa0078K13	dre
ZM_BFa0078M22	bzip
ZM_BFa0079N03	erf
ZM_BFa0079P04	erf
ZM_BFa0083F19	myb
ZM_BFa0086H17	myb
ZM_BFa0087C10	bzip
ZM_BFa0090M23	bzip
ZM_BFa0090M23	erf
ZM_BFa0090M23	myc
ZM_BFa0097N21	bzip
ZM_BFa0098A07	erf
ZM_BFb0001A15	bzip
ZM_BFb0001A21	abf
ZM_BFb0001A21	erf
ZM_BFb0001A22	bzip
ZM_BFb0001B09	bzip
ZM_BFb0001B09	erf
ZM_BFb0001B13	bzip</pre>
<h4>Rebecca给出的分析是这样子的：</h4>
<p>第一列是基因名称，第二列是转录因子</p>
<p>同一个基因可能有几类转录因子的结合位点，因此，<span class='wp_keywordlink_affiliate'><a href="http://liucheng.name/tag/%e7%bb%9f%e8%ae%a1/" title="查看 统计 中的全部文章" target="_blank">统计</a></span>不同的情况</p>
<ul>
<li>1. 一个基因只有一个转录因子，且这个转录因子是bzip；同理再统计myb、myc、erf、abf、dre，共6种情况</li>
<li>2. 一个基因有2个转录因子，那么就可能是6个中任意2个的组合，15种情况</li>
<li>3.一个基因上有3个转录因子，就是6个中任意三个，20种情况</li>
<li>4.一个基因上有4个转录因子，就是15种</li>
<li>5.一个基因上有5个转录因子，6种情况</li>
</ul>
<p>如果枚举的话，貌似太多了</p>
<p><strong>输出的结果是：直接的每种类型的基因个数就行了（即类型-》个数）</strong></p>
<h3>例子代码:temp.pl</h3>
<pre>#!/usr/bin/<span class='wp_keywordlink_affiliate'><a href="http://liucheng.name/tag/perl/" title="perl">perl</a></span>

while(&lt;&gt;){
	chop;
	($id,$type) = split("\t");
	chop($type);  #要用两次chop才能把换行符去掉。奇异。。(较旧的<span class='wp_keywordlink_affiliate'><a href="http://liucheng.name/tag/perl/" title="查看 perl 中的全部文章" target="_blank">perl</a></span>版本可能得改为chomp($type);)
	$gene{$id} .= $type . "\t"; #丢给hash了
}

undef @arr;
foreach $id (sort keys %gene) {
	@type = sort(split("\t", $gene{$id})); #把type变为数组。用sort函数。避免不必要的重复
	$type = join("-",@type);
	push (@arr,$type); #再把所有的type丢进一个数组里
}

$hash{$_}++ for @arr; #从type的数组里统计重复和个数
print "$_\t$hash{$_}\n" for (keys %hash); #输出</pre>
<p>如果按Rebecca的思路枚举的话，的确是需要更多的代码来完成。</p>
<p>幸好最后还是想出了这样一个用hash来完成的办法。也算是可喜，最主要的参考还是  <span class='wp_keywordlink_affiliate'><a href="http://liucheng.name/tag/perl/" title="查看 perl 中的全部文章" target="_blank">perl</a></span>语言编程 里的一段代码。求平均值的。也有着类似的求解过程。</p>
<div class="similarity"><h3>有点相关的文章</h3><ul><li><a href="http://liucheng.name/1205/">Perl处理Fasta序列的又一实例</a> (0.535)</li><li><a href="http://liucheng.name/1236/">用Perl抓取网页和提交表格</a> (0.535)</li><li><a href="http://liucheng.name/1285/">用perl匹配字符串并返回该匹配的位置</a> (0.535)</li><li><a href="http://liucheng.name/935/">两种办法批量去掉重复数据</a> (0.500)</li><li><a href="http://liucheng.name/1055/">Linux下大文件的排序和去重复</a> (0.500)</li><li><a href="http://liucheng.name/850/">有着一千七百多万个url的sitemap文件 </a> (RANDOM - 0.035)</li></ul></div><!-- Mix --><table class="wumii-related-items" cellspacing="0" cellpadding="3" border="0"  style="clear: both;">
    
    <tr>
        <td colspan="5"><b><font size="-1"  style="display: block !important; padding: 20px 0 5px !important;">无觅猜您也喜欢：</font></b></td>
    </tr>
    
        <tr>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important;">
                    <a target="_blank" title="收到《Perl语言编程（第三版）》" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F859%2F&from=http%3A%2F%2Fliucheng.name%2F1286%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/site_images/2010/12/10/1215716.jpg" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">收到《Perl语言编程（第三版）》</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="Entrez所有数据库的最新数据统计" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F429%2F&from=http%3A%2F%2Fliucheng.name%2F1286%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/site_images/2010/12/10/1214796.png" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">Entrez所有数据库的最新数据统计</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="Perl grep函数" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F1251%2F&from=http%3A%2F%2Fliucheng.name%2F1286%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/images/blogWidget/wordpress_default.gif" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">Perl grep函数</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="用Perl抓取网页和提交表格" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F1236%2F&from=http%3A%2F%2Fliucheng.name%2F1286%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/images/blogWidget/wordpress_default.gif" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">用Perl抓取网页和提交表格</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="Perl chdir函数" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F1101%2F&from=http%3A%2F%2Fliucheng.name%2F1286%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/images/blogWidget/wordpress_default.gif" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">Perl chdir函数</font>
                    </a>
                </td>
        </tr>
    
    <tr>
        <td colspan="5" align="right">
            <a style="text-decoration: none !important;" href="http://www.wumii.com/widget/relatedItems.htm" target="_blank" title="无觅相关文章插件">
                <font size="-1" color="#bbbbbb" style="display: block !important; font-family: arial !important; padding: 5px 0 !important; font-size: 12px !important; color: #bbb !important;">无觅</font>
            </a>
        </td>
    </tr>
</table>]]></content:encoded>
			<wfw:commentRss>http://liucheng.name/1286/feed/</wfw:commentRss>
		<slash:comments>24</slash:comments>
		</item>
		<item>
		<title>用perl匹配字符串并返回该匹配的位置</title>
		<link>http://liucheng.name/1285/</link>
		<comments>http://liucheng.name/1285/#comments</comments>
		<pubDate>Sat, 08 May 2010 04:30:52 +0000</pubDate>
		<dc:creator>柳城</dc:creator>
				<category><![CDATA[数据分析]]></category>
		<category><![CDATA[perl]]></category>
		<category><![CDATA[正则表达式]]></category>

		<guid isPermaLink="false">http://liucheng.name/1285/</guid>
		<description><![CDATA[这次讲讲perl里跟模式匹配或叫正则表达式有关的东西。比如说，给出一个序列文件，里面都是Fasta格式的序列。 然后序列里面有一些NNNNNN的连续字符。问题就是要得出这些NNNN的一段字符在该序列的具体位置。就是匹配某字符串。<table class="wumii-related-items" cellspacing="0" cellpadding="3" border="0"  style="clear: both;">
    
    <tr>
        <td colspan="5"><b><font size="-1"  style="display: block !important; padding: 20px 0 5px !important;">无觅猜您也喜欢：</font></b></td>
    </tr>
    
        <tr>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important;">
                    <a target="_blank" title="Perl abs函数" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F998%2F&from=http%3A%2F%2Fliucheng.name%2F1285%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/images/blogWidget/wordpress_default.gif" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">Perl abs函数</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="用Perl抓取网页和提交表格" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F1236%2F&from=http%3A%2F%2Fliucheng.name%2F1285%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/images/blogWidget/wordpress_default.gif" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">用Perl抓取网页和提交表格</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="PHP 正则表达式" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F115%2F&from=http%3A%2F%2Fliucheng.name%2F1285%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/images/blogWidget/wordpress_default.gif" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">PHP 正则表达式</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="Perl close函数" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F1170%2F&from=http%3A%2F%2Fliucheng.name%2F1285%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/images/blogWidget/wordpress_default.gif" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">Perl close函数</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="Perl chomp函数" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F1113%2F&from=http%3A%2F%2Fliucheng.name%2F1285%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/images/blogWidget/wordpress_default.gif" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">Perl chomp函数</font>
                    </a>
                </td>
        </tr>
    
    <tr>
        <td colspan="5" align="right">
            <a style="text-decoration: none !important;" href="http://www.wumii.com/widget/relatedItems.htm" target="_blank" title="无觅相关文章插件">
                <font size="-1" color="#bbbbbb" style="display: block !important; font-family: arial !important; padding: 5px 0 !important; font-size: 12px !important; color: #bbb !important;">无觅</font>
            </a>
        </td>
    </tr>
</table>]]></description>
			<content:encoded><![CDATA[<p>这次讲讲<span class='wp_keywordlink_affiliate'><a href="http://liucheng.name/tag/perl/" title="perl">perl</a></span>里跟模式匹配或叫<span class='wp_keywordlink_affiliate'><a href="http://liucheng.name/tag/regular-expression/" title="查看 正则表达式 中的全部文章" target="_blank">正则表达式</a></span>有关的东西。</p>
<p>最近 <a title="由 ghxiao 发表" href="http://yunbio.com/author/ghxiao/">ghxiao</a> 童鞋常常问我用<span class='wp_keywordlink_affiliate'><a href="http://liucheng.name/tag/perl/" title="查看 perl 中的全部文章" target="_blank">perl</a></span>处理数据的问题。今天来分享一段代码吧。</p>
<p>比如说，给出一个<span class='wp_keywordlink_affiliate'><a href="http://liucheng.name/entrez/" title="序列" target="_blank">序列</a></span>文件，里面都是<span class='wp_keywordlink_affiliate'><a href="http://liucheng.name/770/" title="Fasta" target="_blank">Fasta</a></span>格式的序列。 然后序列里面有一些NNNNNN的连续字符。</p>
<p>问题就是要得出这些NNNN的一段字符在该序列的具体位置。（就是匹配某字符串）<span id="more-1285"></span></p>
<h3>例子文件：seq.fasta</h3>
<pre>&gt;CMM_00532
CGCGCGCTGTGCTACGCAGGCCTCTTCCAGGCCCATCTCCCGGCGGCGTGCACCACTACC
AGGATGGTGTGCGTGGGCGGGGGCGCCGCCGAGCTGGTCGCCTTTGCCAGCTTCTTGGGC
GACGACGACGACGACGACGGGGCGCACAGCAGCAGGCGCGGGGAGCTGACGCTNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGGTCGACGCGGCGCCGTGGGAGGGCGT
CGCGGACACGGTGCTGCGCGCGCTCACGACGCCGCTGCCGCTGTCCCCAGTCCGGAGCAG
&gt;CMM_00589
ACGGGCGTGTTCCTGGCGTACGGCGGCAGCGACGATGCGCTGCCGGAGGCGGGCCTCGCG
GTGCGCATGAACGACGGGCCTTCGGGCCCTGCGTTTTGGCCGCAGCCGCGCCTGCGGCTC
ATGGAGATGCTGCTGCCGTACCTCGACCAGCACCGCTTCGCGGCCGGCGATATNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGCCTGGCATCTGCGGCAGCAGTGGGAC
GTGCCGCGGACGCACGCGTACTACGTGCCGCCCGGCGCCGTGCGGACGGCCGCGCCGCTG
CTGCTCATGGCGGCGACGCGCGACCCCGTGACGCCGTATGCGGCGGCGCGCGCGGCGCTC
&gt;CMM_00662
GCCGTACTCTCCCAGAACGACTTGGCCTCTGCCCGTACCCTCTTTAAAGACAACCTCAAC
CTGACGCCCTATATTGCCTCGACCGAGTGCAGCGGCGTGTGGGCGCGCCGNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGGGCCGCCGGACGAGGAGCGCGGCATGGTC
GAGGTCGGGTACGGGATCGACCCGGCGTGCCGGCGGCGGGGACACGCGCGGGCGGCGCTG
&gt;CMM_00942
CTCAACCTGCGCGACGCCGGCGCCGTGGCGGGCAGCGCGATCCCCGCCGGGCGCGTGTAC
CGCTGCGGCACGCTCGAGTACGCGGCCGCCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNCTCGCCGACTTTGCCGAGGCCGGGGCGTCGCCGCCTGGCGCACGCAGTAC
CTCCACGTCGCGCTGGCGTATGCGCCCACGTTCCGCGCCGTCCTGGAGCATGTGCGCGAC</pre>
<p>只取了一小部分数据。</p>
<h4>分析：</h4>
<ul>
<li>1，CMM_开头的是该序列的ID<br />
2，序列有几行。所以要去掉换行符，先变成一行。这样才能得出正确的位置。</li>
</ul>
<h3>代码例子：temp.pl</h3>
<pre>$/ = "&gt;";

print "id\tstart\tend\tlen\n";  #先输出Title
while(&lt;&gt;){
	if($_ =~ /(CMM.*?)\s(.*)&gt;/ms){ #第一个括号匹配ID。第二个匹配序列
		$id = $1;
		$seq = $2;
		$seq =~ s/\s//g; #把序列里的换行去掉。变成一行
		while ($seq =~ m/(N+)/g) { #匹配一个N或以上的字符
			$len = length($1); #返回这段匹配的长度
			$end = pos($seq); #用pos函数返回该匹配的终止位置
			$start = $end - $len + 1; #计算出起始位置
			print "$id\t$start\t$end\t$len\n"; #输出结果
		}
	}
}</pre>
<pre>运行 <span class='wp_keywordlink_affiliate'><a href="http://liucheng.name/tag/perl/" title="查看 perl 中的全部文章" target="_blank">perl</a></span> temp.pl seq.fasta &gt;output</pre>
<p>输出output文件。</p>
<p>~完。</p>
<div class="similarity"><h3>有点相关的文章</h3><ul><li><a href="http://liucheng.name/1205/">Perl处理Fasta序列的又一实例</a> (0.535)</li><li><a href="http://liucheng.name/1236/">用Perl抓取网页和提交表格</a> (0.535)</li><li><a href="http://liucheng.name/1286/">用perl统计一个六种类型的数据</a> (0.535)</li><li><a href="http://liucheng.name/935/">两种办法批量去掉重复数据</a> (0.500)</li><li><a href="http://liucheng.name/1055/">Linux下大文件的排序和去重复</a> (0.500)</li><li><a href="http://liucheng.name/868/">Perl:FastQ与FastA格式的相互转换</a> (RANDOM - 0.035)</li></ul></div><!-- Mix --><table class="wumii-related-items" cellspacing="0" cellpadding="3" border="0"  style="clear: both;">
    
    <tr>
        <td colspan="5"><b><font size="-1"  style="display: block !important; padding: 20px 0 5px !important;">无觅猜您也喜欢：</font></b></td>
    </tr>
    
        <tr>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important;">
                    <a target="_blank" title="Perl abs函数" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F998%2F&from=http%3A%2F%2Fliucheng.name%2F1285%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/images/blogWidget/wordpress_default.gif" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">Perl abs函数</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="用Perl抓取网页和提交表格" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F1236%2F&from=http%3A%2F%2Fliucheng.name%2F1285%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/images/blogWidget/wordpress_default.gif" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">用Perl抓取网页和提交表格</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="PHP 正则表达式" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F115%2F&from=http%3A%2F%2Fliucheng.name%2F1285%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/images/blogWidget/wordpress_default.gif" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">PHP 正则表达式</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="Perl close函数" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F1170%2F&from=http%3A%2F%2Fliucheng.name%2F1285%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/images/blogWidget/wordpress_default.gif" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">Perl close函数</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="Perl chomp函数" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F1113%2F&from=http%3A%2F%2Fliucheng.name%2F1285%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/images/blogWidget/wordpress_default.gif" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">Perl chomp函数</font>
                    </a>
                </td>
        </tr>
    
    <tr>
        <td colspan="5" align="right">
            <a style="text-decoration: none !important;" href="http://www.wumii.com/widget/relatedItems.htm" target="_blank" title="无觅相关文章插件">
                <font size="-1" color="#bbbbbb" style="display: block !important; font-family: arial !important; padding: 5px 0 !important; font-size: 12px !important; color: #bbb !important;">无觅</font>
            </a>
        </td>
    </tr>
</table>]]></content:encoded>
			<wfw:commentRss>http://liucheng.name/1285/feed/</wfw:commentRss>
		<slash:comments>15</slash:comments>
		</item>
		<item>
		<title>用Perl抓取网页和提交表格</title>
		<link>http://liucheng.name/1236/</link>
		<comments>http://liucheng.name/1236/#comments</comments>
		<pubDate>Thu, 25 Feb 2010 06:20:24 +0000</pubDate>
		<dc:creator>柳城</dc:creator>
				<category><![CDATA[数据分析]]></category>
		<category><![CDATA[perl]]></category>

		<guid isPermaLink="false">http://liucheng.name/?p=1236</guid>
		<description><![CDATA[这里简单介绍一下用Perl来实现抓取网页的源代码，以及用POST的方法来提交表格，并返回结果。难的讲不来，讲讲简单的。use LWP::Simple;use LWP::UserAgent;
<table class="wumii-related-items" cellspacing="0" cellpadding="3" border="0"  style="clear: both;">
    
    <tr>
        <td colspan="5"><b><font size="-1"  style="display: block !important; padding: 20px 0 5px !important;">无觅猜您也喜欢：</font></b></td>
    </tr>
    
        <tr>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important;">
                    <a target="_blank" title="Perl grep函数" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F1251%2F&from=http%3A%2F%2Fliucheng.name%2F1236%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/images/blogWidget/wordpress_default.gif" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">Perl grep函数</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="Perl closedir函数" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F1176%2F&from=http%3A%2F%2Fliucheng.name%2F1236%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/images/blogWidget/wordpress_default.gif" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">Perl closedir函数</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="快速提取序列的Perl脚本" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F1202%2F&from=http%3A%2F%2Fliucheng.name%2F1236%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/images/blogWidget/wordpress_default.gif" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">快速提取序列的Perl脚本</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="收到《Perl语言编程（第三版）》" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F859%2F&from=http%3A%2F%2Fliucheng.name%2F1236%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/site_images/2010/12/10/1215716.jpg" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">收到《Perl语言编程（第三版）》</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="Perl:用Net::FTP来上传下载文件" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F852%2F&from=http%3A%2F%2Fliucheng.name%2F1236%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/images/blogWidget/wordpress_default.gif" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">Perl:用Net::FTP来上传下载文件</font>
                    </a>
                </td>
        </tr>
    
    <tr>
        <td colspan="5" align="right">
            <a style="text-decoration: none !important;" href="http://www.wumii.com/widget/relatedItems.htm" target="_blank" title="无觅相关文章插件">
                <font size="-1" color="#bbbbbb" style="display: block !important; font-family: arial !important; padding: 5px 0 !important; font-size: 12px !important; color: #bbb !important;">无觅</font>
            </a>
        </td>
    </tr>
</table>]]></description>
			<content:encoded><![CDATA[<p>这里简单介绍一下用<span class='wp_keywordlink_affiliate'><a href="http://liucheng.name/tag/perl/" title="Perl" target="_blank">Perl</a></span>来实现抓取网页的源代码，以及用POST的方法来提交表格，并返回结果。难的讲不来，讲讲简单的。</p>
<p>这里讲到的Perl模块有：</p>
<pre>use LWP::Simple;</pre>
<pre>use LWP::UserAgent;</pre>
<p>用<strong><span class='wp_keywordlink_affiliate'><a href="http://liucheng.name/tag/perl/" title="查看 perl 中的全部文章" target="_blank">perl</a></span>doc</strong>查看详细的用法。<span id="more-1236"></span></p>
<h3>1，用<span class='wp_keywordlink_affiliate'><a href="http://liucheng.name/tag/perl/" title="perl">perl</a></span>抓取网页</h3>
<p>如果只是要拿到某个网页，那使用 LWP::Simple 里的函数是最简单的。通过调用<strong> get($url) </strong>函数，就可以得到相关网址的内容。</p>
<pre>my $url = 'http://freshair.npr.org/dayFA.cfm?todayDate=current';

use LWP::Simple;
my $content = get $url;
die "Couldn't get $url" unless defined $content;

#  $content 里是网页内容，下面是对此内容作些分析：

if($content =~ m/jazz/i) {
print "They're talking about jazz today on Fresh Air!\n";
} else {
print "Fresh Air is apparently jazzless today.\n";
}</pre>
<p>非常简单易懂。拿网页内容是容易的，难的是用正则过滤需要的内容。</p>
<h3>2，通过 POST提交表格</h3>
<p>部分HTML表格使用HTML POST 向服务器提交数据，在这里你可以这样：</p>
<pre>$response = $browser-&gt;post( $url,
   [
     formkey1 =&gt; value1,
     formkey2 =&gt; value2,
     ...
   ],
 );</pre>
<p>实例分析：例如在(<a href="http://www.enzim.hu/hmmtop/html/submit.html">http://www.enzim.hu/hmmtop/html/submit.html</a>)提交一段<span class='wp_keywordlink_affiliate'><a href="http://liucheng.name/entrez/" title="序列" target="_blank">序列</a></span>并返回结果，用perl来实现。代码如下：</p>
<pre>#!/usr/bin/perl

use LWP::UserAgent;
my $browser = LWP::UserAgent-&gt;new;
$protein = "MSSSTPFDPYALSEHDEERPQNVQSKSRTAELQAEIDDTVGIMRDNINKVAERGERLTSI";
my $SUSUI_URL = "http://www.enzim.hu/hmmtop/server/hmmtop.cgi";
my $response = $browser-&gt;post( $SUSUI_URL,    [ 'if' =&gt; $protein, ]  );

if ($response-&gt;is_success) {
	print $response-&gt;content;
} else {
	print "Bad luck this time\n";
}</pre>
<p>通过分析<a href="http://www.enzim.hu/hmmtop/html/submit.html">http://www.enzim.hu/hmmtop/html/submit.html</a>的页面可知，这个要提交的input只有一个，就是<span class='wp_keywordlink_affiliate'><a href="http://liucheng.name/" title="柳城">name</a></span>="if"。$protein就是要提交的序列。$response-&gt;content就是返回结果。</p>
<div class="similarity"><h3>有点相关的文章</h3><ul><li><a href="http://liucheng.name/1205/">Perl处理Fasta序列的又一实例</a> (1.000)</li><li><a href="http://liucheng.name/1285/">用perl匹配字符串并返回该匹配的位置</a> (1.000)</li><li><a href="http://liucheng.name/1286/">用perl统计一个六种类型的数据</a> (1.000)</li><li><a href="http://liucheng.name/477/">如何用perl处理测序文件</a> (0.500)</li><li><a href="http://liucheng.name/496/">perl常用的内置特殊变量</a> (0.500)</li><li><a href="http://liucheng.name/1202/">快速提取序列的Perl脚本</a> (RANDOM - 0.500)</li></ul></div><!-- Mix --><table class="wumii-related-items" cellspacing="0" cellpadding="3" border="0"  style="clear: both;">
    
    <tr>
        <td colspan="5"><b><font size="-1"  style="display: block !important; padding: 20px 0 5px !important;">无觅猜您也喜欢：</font></b></td>
    </tr>
    
        <tr>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important;">
                    <a target="_blank" title="Perl grep函数" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F1251%2F&from=http%3A%2F%2Fliucheng.name%2F1236%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/images/blogWidget/wordpress_default.gif" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">Perl grep函数</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="Perl closedir函数" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F1176%2F&from=http%3A%2F%2Fliucheng.name%2F1236%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/images/blogWidget/wordpress_default.gif" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">Perl closedir函数</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="快速提取序列的Perl脚本" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F1202%2F&from=http%3A%2F%2Fliucheng.name%2F1236%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/images/blogWidget/wordpress_default.gif" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">快速提取序列的Perl脚本</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="收到《Perl语言编程（第三版）》" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F859%2F&from=http%3A%2F%2Fliucheng.name%2F1236%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/site_images/2010/12/10/1215716.jpg" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">收到《Perl语言编程（第三版）》</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="Perl:用Net::FTP来上传下载文件" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F852%2F&from=http%3A%2F%2Fliucheng.name%2F1236%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/images/blogWidget/wordpress_default.gif" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">Perl:用Net::FTP来上传下载文件</font>
                    </a>
                </td>
        </tr>
    
    <tr>
        <td colspan="5" align="right">
            <a style="text-decoration: none !important;" href="http://www.wumii.com/widget/relatedItems.htm" target="_blank" title="无觅相关文章插件">
                <font size="-1" color="#bbbbbb" style="display: block !important; font-family: arial !important; padding: 5px 0 !important; font-size: 12px !important; color: #bbb !important;">无觅</font>
            </a>
        </td>
    </tr>
</table>]]></content:encoded>
			<wfw:commentRss>http://liucheng.name/1236/feed/</wfw:commentRss>
		<slash:comments>21</slash:comments>
		</item>
		<item>
		<title>Perl处理Fasta序列的又一实例</title>
		<link>http://liucheng.name/1205/</link>
		<comments>http://liucheng.name/1205/#comments</comments>
		<pubDate>Wed, 13 Jan 2010 01:41:12 +0000</pubDate>
		<dc:creator>柳城</dc:creator>
				<category><![CDATA[数据分析]]></category>
		<category><![CDATA[Fasta]]></category>
		<category><![CDATA[perl]]></category>

		<guid isPermaLink="false">http://liucheng.name/?p=1205</guid>
		<description><![CDATA[今天再来分享一小段Perl脚本，其实是比较简单的。主要是用来处理Fasta格式的序列。分割fasta文件的perl脚本，我需要把一个fasta文件按两条序列一组分成若干个fasta文件。<table class="wumii-related-items" cellspacing="0" cellpadding="3" border="0"  style="clear: both;">
    
    <tr>
        <td colspan="5"><b><font size="-1"  style="display: block !important; padding: 20px 0 5px !important;">无觅猜您也喜欢：</font></b></td>
    </tr>
    
        <tr>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important;">
                    <a target="_blank" title="快速提取序列的Perl脚本" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F1202%2F&from=http%3A%2F%2Fliucheng.name%2F1205%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/images/blogWidget/wordpress_default.gif" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">快速提取序列的Perl脚本</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="Perl:FastQ与FastA格式的相互转换" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F868%2F&from=http%3A%2F%2Fliucheng.name%2F1205%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/images/blogWidget/wordpress_default.gif" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">Perl:FastQ与FastA格式的相互转换</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="如何用perl处理测序文件" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F477%2F&from=http%3A%2F%2Fliucheng.name%2F1205%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/images/blogWidget/wordpress_default.gif" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">如何用perl处理测序文件</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="Fasta格式的详细说明" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F770%2F&from=http%3A%2F%2Fliucheng.name%2F1205%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/images/blogWidget/wordpress_default.gif" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">Fasta格式的详细说明</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="收到《Perl语言编程（第三版）》" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F859%2F&from=http%3A%2F%2Fliucheng.name%2F1205%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/site_images/2010/12/10/1215716.jpg" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">收到《Perl语言编程（第三版）》</font>
                    </a>
                </td>
        </tr>
    
    <tr>
        <td colspan="5" align="right">
            <a style="text-decoration: none !important;" href="http://www.wumii.com/widget/relatedItems.htm" target="_blank" title="无觅相关文章插件">
                <font size="-1" color="#bbbbbb" style="display: block !important; font-family: arial !important; padding: 5px 0 !important; font-size: 12px !important; color: #bbb !important;">无觅</font>
            </a>
        </td>
    </tr>
</table>]]></description>
			<content:encoded><![CDATA[<p>今天再来分享一小段<span class='wp_keywordlink_affiliate'><a href="http://liucheng.name/tag/perl/" title="Perl" target="_blank">Perl</a></span>脚本，其实是比较简单的。主要是用来处理<span class='wp_keywordlink_affiliate'><a href="http://liucheng.name/770/" title="Fasta" target="_blank">Fasta</a></span>格式的<span class='wp_keywordlink_affiliate'><a href="http://liucheng.name/entrez/" title="序列" target="_blank">序列</a></span>。</p>
<h4>网友niche Says:</h4>
<blockquote><p>你好，你有没有分割fasta文件的<span class='wp_keywordlink_affiliate'><a href="http://liucheng.name/tag/perl/" title="perl">perl</a></span>脚本，我需要把一个fasta文件按两条序列一组分成若干个fasta文件，谢谢！</p></blockquote>
<p>如果不懂fasta文件，可以再看一下<strong>解释</strong>：<span id="more-1205"></span><br />
或是查看: <a href="http://liucheng.name/770/" target="_blank">Fasta格式的详细说明</a></p>
<pre>&gt;cel-miR-1 MIMAT0000003 Caenorhabditis elegans miR-1
UGGAAUGUAAAGAAGUAUGUA
&gt;cel-miR-2 MIMAT0000004 Caenorhabditis elegans miR-2
UAUCACAGCCAGCUUUGAUGUGCUAUCACAGCCAGCUUUG
UAUCACAGCCAGCUUUGAUGUGC
……</pre>
<p>其中标识符就是大于号'&gt;'。按两个为一组分成若干个文件。大意上是这样。</p>
<h3>分割fasta文件</h3>
<pre>#!/usr/bin/<span class='wp_keywordlink_affiliate'><a href="http://liucheng.name/tag/perl/" title="查看 perl 中的全部文章" target="_blank">perl</a></span>

#fasta2.pl
#Usage:perl fasta2.pl in_fasta out_file

open(IN,"&lt;$ARGV[0]");
$i = 0;
$j = 1;
while(&lt;IN&gt;){
    if(/^&gt;/){
        $i++;
    }
    if($i == 3){
        $j++;
        $i = 1;
    }
    open(OUT,"&gt;&gt;$ARGV[1]_$j");
    print OUT $_;
}</pre>
<p><strong>注：</strong> in_fasta是指要处理的fasta文件。</p>
<p>           out_file是指输出的文件。（如命名为out, 则生成的文件名为out_1, out_2, out_3等）</p>
<p>主要是利用循环嘛，第一步是按大于号'&gt;'来统计个数。再用$j来循环输出文件名。</p>
<div class="similarity"><h3>有点相关的文章</h3><ul><li><a href="http://liucheng.name/1236/">用Perl抓取网页和提交表格</a> (0.565)</li><li><a href="http://liucheng.name/1285/">用perl匹配字符串并返回该匹配的位置</a> (0.565)</li><li><a href="http://liucheng.name/1286/">用perl统计一个六种类型的数据</a> (0.565)</li><li><a href="http://liucheng.name/868/">Perl:FastQ与FastA格式的相互转换</a> (0.500)</li><li><a href="http://liucheng.name/935/">两种办法批量去掉重复数据</a> (0.500)</li><li><a href="http://liucheng.name/1160/">两种方法查看文件的行数</a> (RANDOM - 0.500)</li></ul></div><!-- Mix --><table class="wumii-related-items" cellspacing="0" cellpadding="3" border="0"  style="clear: both;">
    
    <tr>
        <td colspan="5"><b><font size="-1"  style="display: block !important; padding: 20px 0 5px !important;">无觅猜您也喜欢：</font></b></td>
    </tr>
    
        <tr>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important;">
                    <a target="_blank" title="快速提取序列的Perl脚本" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F1202%2F&from=http%3A%2F%2Fliucheng.name%2F1205%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/images/blogWidget/wordpress_default.gif" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">快速提取序列的Perl脚本</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="Perl:FastQ与FastA格式的相互转换" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F868%2F&from=http%3A%2F%2Fliucheng.name%2F1205%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/images/blogWidget/wordpress_default.gif" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">Perl:FastQ与FastA格式的相互转换</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="如何用perl处理测序文件" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F477%2F&from=http%3A%2F%2Fliucheng.name%2F1205%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/images/blogWidget/wordpress_default.gif" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">如何用perl处理测序文件</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="Fasta格式的详细说明" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F770%2F&from=http%3A%2F%2Fliucheng.name%2F1205%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/images/blogWidget/wordpress_default.gif" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">Fasta格式的详细说明</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="收到《Perl语言编程（第三版）》" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F859%2F&from=http%3A%2F%2Fliucheng.name%2F1205%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/site_images/2010/12/10/1215716.jpg" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">收到《Perl语言编程（第三版）》</font>
                    </a>
                </td>
        </tr>
    
    <tr>
        <td colspan="5" align="right">
            <a style="text-decoration: none !important;" href="http://www.wumii.com/widget/relatedItems.htm" target="_blank" title="无觅相关文章插件">
                <font size="-1" color="#bbbbbb" style="display: block !important; font-family: arial !important; padding: 5px 0 !important; font-size: 12px !important; color: #bbb !important;">无觅</font>
            </a>
        </td>
    </tr>
</table>]]></content:encoded>
			<wfw:commentRss>http://liucheng.name/1205/feed/</wfw:commentRss>
		<slash:comments>20</slash:comments>
		</item>
		<item>
		<title>两种方法查看文件的行数</title>
		<link>http://liucheng.name/1160/</link>
		<comments>http://liucheng.name/1160/#comments</comments>
		<pubDate>Fri, 04 Dec 2009 09:12:11 +0000</pubDate>
		<dc:creator>柳城</dc:creator>
				<category><![CDATA[数据分析]]></category>
		<category><![CDATA[Excel]]></category>
		<category><![CDATA[Linux]]></category>
		<category><![CDATA[wc]]></category>

		<guid isPermaLink="false">http://liucheng.name/?p=1160</guid>
		<description><![CDATA[介绍两种我常用的计算行数的方法，Excel的方法及linux命令的方法。打开Excel，在右下角的地方点右键，有“平均值”、“计数”、“计数值”、“最大值”、“求和”等。在Linux下用wc进行计数。返回文件的行数、字数、字节数等。<table class="wumii-related-items" cellspacing="0" cellpadding="3" border="0"  style="clear: both;">
    
    <tr>
        <td colspan="5"><b><font size="-1"  style="display: block !important; padding: 20px 0 5px !important;">无觅猜您也喜欢：</font></b></td>
    </tr>
    
        <tr>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important;">
                    <a target="_blank" title="Linux下大文件的排序和去重复" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F1055%2F&from=http%3A%2F%2Fliucheng.name%2F1160%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/images/blogWidget/wordpress_default.gif" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">Linux下大文件的排序和去重复</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="linux:crontab命令用法" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F578%2F&from=http%3A%2F%2Fliucheng.name%2F1160%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/images/blogWidget/wordpress_default.gif" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">linux:crontab命令用法</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="BioPerl安装指南:Unix/Linux/Windows下的安装" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F765%2F&from=http%3A%2F%2Fliucheng.name%2F1160%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/site_images/2010/12/10/1214174.png" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">BioPerl安装指南:Unix/Linux/Windows下的安装</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="Linux：FTP全部命令 使用方法介绍" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F350%2F&from=http%3A%2F%2Fliucheng.name%2F1160%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/images/blogWidget/wordpress_default.gif" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">Linux：FTP全部命令 使用方法介绍</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="linux:awk中的NR,FNR" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F579%2F&from=http%3A%2F%2Fliucheng.name%2F1160%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/images/blogWidget/wordpress_default.gif" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">linux:awk中的NR,FNR</font>
                    </a>
                </td>
        </tr>
    
    <tr>
        <td colspan="5" align="right">
            <a style="text-decoration: none !important;" href="http://www.wumii.com/widget/relatedItems.htm" target="_blank" title="无觅相关文章插件">
                <font size="-1" color="#bbbbbb" style="display: block !important; font-family: arial !important; padding: 5px 0 !important; font-size: 12px !important; color: #bbb !important;">无觅</font>
            </a>
        </td>
    </tr>
</table>]]></description>
			<content:encoded><![CDATA[<p>对于我们所操作的文件或是数据，行数是一个最常用的值。最后的统计结果当中，这个行数也是差不多作为一个必需项出现的，因为行数在大部分情况下，就是代表着总数。</p>
<p>我每天工作都是在接触两种系统：XP和<span class='wp_keywordlink_affiliate'><a href="http://liucheng.name/tag/linux/" title="Linux" target="_blank">Linux</a></span>。所以介绍两种我常用的<strong>计算行数的方法，<span class='wp_keywordlink_affiliate'><a href="http://liucheng.name/tag/excel/" title="查看 Excel 中的全部文章" target="_blank">Excel</a></span>的方法及linux命令的方法</strong>。<span id="more-1160"></span></p>
<h3>1，用<span class='wp_keywordlink_affiliate'><a href="http://liucheng.name/tag/excel/" title="查看 Excel 中的全部文章" target="_blank">Excel</a></span>查看行数</h3>
<p>打开Excel，在右下角的地方点右键，有“平均值”、“计数”、“计数值”、“最大值”、“求和”等。选择“计数”，因为“计数”是最常用的。这里的“计数”可以计算行数，也可以计算列数。计数的范围就是鼠标选择的范围。</p>
<p style="text-align: center;"> <a href="http://liucheng.name/wp-content/uploads/2009/12/excel_wc.png" class="highslide-image" onclick="return hs.expand(this);"><img class="size-full wp-image-1161 aligncenter" title="用Excel查看行数" src="http://liucheng.name/wp-content/uploads/2009/12/excel_wc.png" alt="用Excel查看行数" width="223" height="183" /></a></p>
<p>“计数”与“计数值”的区别：“计数值”是指计算仅是数字的数值，不包括字符串。“计数”就没有这限制。</p>
<p><strong>缺点：</strong>只有一个值时无法计算。这在数据大时会引起一些问题。例如误删等情况。需要特别留意。具体看图：</p>
<p style="text-align: center;"><a href="http://liucheng.name/wp-content/uploads/2009/12/excel_wc_2.png" class="highslide-image" onclick="return hs.expand(this);"><img class="size-full wp-image-1162 aligncenter" title="两种方法查看文件的行数" src="http://liucheng.name/wp-content/uploads/2009/12/excel_wc_2.png" alt="两种方法查看文件的行数" width="237" height="211" /></a><br />
<a href="http://liucheng.name/wp-content/uploads/2009/12/excel_wc_1.png" class="highslide-image" onclick="return hs.expand(this);"><img class="aligncenter size-medium wp-image-1163" title="两种方法查看文件的行数" src="http://liucheng.name/wp-content/uploads/2009/12/excel_wc_1-300x246.png" alt="两种方法查看文件的行数" width="240" height="197" /></a>
</p>
<p style="text-align: center;">只有一个值时无法计数</p>
<h3>2，用<span class='wp_keywordlink_affiliate'><a href="http://liucheng.name/tag/linux/" title="查看 Linux 中的全部文章" target="_blank">Linux</a></span>的<span class='wp_keywordlink_affiliate'><a href="http://liucheng.name/tag/wc/" title="查看 wc 中的全部文章" target="_blank">wc</a></span>命令</h3>
<p>在Linux下用<span class='wp_keywordlink_affiliate'><a href="http://liucheng.name/tag/wc/" title="查看 wc 中的全部文章" target="_blank">wc</a></span>进行计数。返回文件的行数、字数、字节数等。</p>
<p><strong>看个例子：</strong></p>
<pre>wc wc1.txt
3  5 16 wc1.txt
输出信息依次是：行数 字数 字节数 文件名称。</pre>
<p>再具体点，单个统计。</p>
<pre>wc -m filename：显示一个文件的字符数
wc -l filename：显示一个文件的行数
wc -L filename：显示一个文件中的最长行的长度
wc -w filename：显示一个文件的字数</pre>
<p><strong>需要留意的：</strong>貌似wc统计的行算是用换行符来确定的。就是说最后一行要有换行符，最后wc的行数才是正确的，否则将会少一行。</p>
<p>为了说明这个问题，看一个<span class='wp_keywordlink_affiliate'><a href="http://liucheng.name/tag/perl/" title="perl">perl</a></span>的测试：</p>
<pre>perl -e 'print "a"'|wc
      0       1       1
perl -e 'print "a\n"'|wc
      1       1       2</pre>
<p>够清楚了吧。</p>
<div class="similarity"><h3>有点相关的文章</h3><ul><li><a href="http://liucheng.name/1055/">Linux下大文件的排序和去重复</a> (1.000)</li><li><a href="http://liucheng.name/203/">AWK详细参考(转载整理)</a> (0.500)</li><li><a href="http://liucheng.name/206/">使用linux中的sed编辑器</a> (0.500)</li><li><a href="http://liucheng.name/209/">IBM文章:Shell、Shell 脚本编写、命令行、相关工具及技巧</a> (0.500)</li><li><a href="http://liucheng.name/350/">Linux：FTP全部命令 使用方法介绍</a> (0.500)</li><li><a href="http://liucheng.name/578/">linux:crontab命令用法</a> (RANDOM - 0.500)</li></ul></div><!-- Mix --><table class="wumii-related-items" cellspacing="0" cellpadding="3" border="0"  style="clear: both;">
    
    <tr>
        <td colspan="5"><b><font size="-1"  style="display: block !important; padding: 20px 0 5px !important;">无觅猜您也喜欢：</font></b></td>
    </tr>
    
        <tr>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important;">
                    <a target="_blank" title="Linux下大文件的排序和去重复" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F1055%2F&from=http%3A%2F%2Fliucheng.name%2F1160%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/images/blogWidget/wordpress_default.gif" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">Linux下大文件的排序和去重复</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="linux:crontab命令用法" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F578%2F&from=http%3A%2F%2Fliucheng.name%2F1160%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/images/blogWidget/wordpress_default.gif" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">linux:crontab命令用法</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="BioPerl安装指南:Unix/Linux/Windows下的安装" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F765%2F&from=http%3A%2F%2Fliucheng.name%2F1160%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/site_images/2010/12/10/1214174.png" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">BioPerl安装指南:Unix/Linux/Windows下的安装</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="Linux：FTP全部命令 使用方法介绍" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F350%2F&from=http%3A%2F%2Fliucheng.name%2F1160%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/images/blogWidget/wordpress_default.gif" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">Linux：FTP全部命令 使用方法介绍</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="linux:awk中的NR,FNR" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F579%2F&from=http%3A%2F%2Fliucheng.name%2F1160%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/images/blogWidget/wordpress_default.gif" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">linux:awk中的NR,FNR</font>
                    </a>
                </td>
        </tr>
    
    <tr>
        <td colspan="5" align="right">
            <a style="text-decoration: none !important;" href="http://www.wumii.com/widget/relatedItems.htm" target="_blank" title="无觅相关文章插件">
                <font size="-1" color="#bbbbbb" style="display: block !important; font-family: arial !important; padding: 5px 0 !important; font-size: 12px !important; color: #bbb !important;">无觅</font>
            </a>
        </td>
    </tr>
</table>]]></content:encoded>
			<wfw:commentRss>http://liucheng.name/1160/feed/</wfw:commentRss>
		<slash:comments>17</slash:comments>
		</item>
		<item>
		<title>Linux下大文件的排序和去重复</title>
		<link>http://liucheng.name/1055/</link>
		<comments>http://liucheng.name/1055/#comments</comments>
		<pubDate>Tue, 20 Oct 2009 02:20:31 +0000</pubDate>
		<dc:creator>柳城</dc:creator>
				<category><![CDATA[数据分析]]></category>
		<category><![CDATA[Linux]]></category>
		<category><![CDATA[sort]]></category>
		<category><![CDATA[uniq]]></category>
		<category><![CDATA[去重复]]></category>

		<guid isPermaLink="false">http://liucheng.name/?p=1055</guid>
		<description><![CDATA[Linux下我们用 sort 与 uniq 的命令来实现去重复行。但有时碰到一个大文件时（例如G级的文件），用上面的命令时报错，提示空间不足。我尝试了一下，最后是用 split 命令把大文件分割为几个小文件，单独排完序后再合并 uniq 。<table class="wumii-related-items" cellspacing="0" cellpadding="3" border="0"  style="clear: both;">
    
    <tr>
        <td colspan="5"><b><font size="-1"  style="display: block !important; padding: 20px 0 5px !important;">无觅猜您也喜欢：</font></b></td>
    </tr>
    
        <tr>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important;">
                    <a target="_blank" title="linux:awk中的NR,FNR" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F579%2F&from=http%3A%2F%2Fliucheng.name%2F1055%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/images/blogWidget/wordpress_default.gif" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">linux:awk中的NR,FNR</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="BioPerl安装指南:Unix/Linux/Windows下的安装" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F765%2F&from=http%3A%2F%2Fliucheng.name%2F1055%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/site_images/2010/12/10/1214174.png" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">BioPerl安装指南:Unix/Linux/Windows下的安装</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="linux:crontab命令用法" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F578%2F&from=http%3A%2F%2Fliucheng.name%2F1055%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/images/blogWidget/wordpress_default.gif" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">linux:crontab命令用法</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="Linux：FTP全部命令 使用方法介绍" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F350%2F&from=http%3A%2F%2Fliucheng.name%2F1055%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/images/blogWidget/wordpress_default.gif" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">Linux：FTP全部命令 使用方法介绍</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="使用linux中的sed编辑器" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F206%2F&from=http%3A%2F%2Fliucheng.name%2F1055%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/images/blogWidget/wordpress_default.gif" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">使用linux中的sed编辑器</font>
                    </a>
                </td>
        </tr>
    
    <tr>
        <td colspan="5" align="right">
            <a style="text-decoration: none !important;" href="http://www.wumii.com/widget/relatedItems.htm" target="_blank" title="无觅相关文章插件">
                <font size="-1" color="#bbbbbb" style="display: block !important; font-family: arial !important; padding: 5px 0 !important; font-size: 12px !important; color: #bbb !important;">无觅</font>
            </a>
        </td>
    </tr>
</table>]]></description>
			<content:encoded><![CDATA[<p><span class='wp_keywordlink_affiliate'><a href="http://liucheng.name/tag/linux/" title="Linux" target="_blank">Linux</a></span>下我们用 <span class='wp_keywordlink_affiliate'><a href="http://liucheng.name/tag/sort/" title="查看 sort 中的全部文章" target="_blank">sort</a></span> 与 <span class='wp_keywordlink_affiliate'><a href="http://liucheng.name/tag/uniq/" title="查看 uniq 中的全部文章" target="_blank">uniq</a></span> 的命令来实现<span class='wp_keywordlink_affiliate'><a href="http://liucheng.name/tag/%e5%8e%bb%e9%87%8d%e5%a4%8d/" title="查看 去重复 中的全部文章" target="_blank">去重复</a></span>行。</p>
<h3>去重复行</h3>
<pre>简单的用法如下，如一个文件名：happybirthday.txt

cat happybirthday.txt (显示文件内容)

Happy Birthday to You!
Happy Birthday to You!
Happy Birthday Dear Tux!
Happy Birthday to You!

cat happybirthday.txt|sort （排序）

Happy Birthday Dear Tux!
Happy Birthday to You!
Happy Birthday to You!
Happy Birthday to You!

cat happybirthday.txt|sort|<span class='wp_keywordlink_affiliate'><a href="http://liucheng.name/tag/uniq/" title="查看 uniq 中的全部文章" target="_blank">uniq</a></span> (去重复行)

Happy Birthday Dear Tux!
Happy Birthday to You!</pre>
<p><span id="more-1055"></span></p>
<h3>去大文件重复行</h3>
<p>但有时碰到一个大文件时（例如G级的文件），用上面的命令时报错，提示空间不足。我尝试了一下，最后是用 split 命令把大文件分割为几个小文件，单独排完序后再合并 uniq 。</p>
<pre>split -b 200m  happybirthday.big Prefix_

用-b参数切割happybirthday.big，小文件为200M。切割后的文件名前缀是Prefix_</pre>
<p><strong>切割后的文件名如</strong></p>
<pre>Prefix_aa

Prefix_ab</pre>
<p><strong>再分别sort</strong></p>
<pre>sort Prefix_aa &gt;Prefix_aa.sort

sort Prefix_ab &gt;Prefix_ab.sort</pre>
<p><strong>再用 sort -m合并，再 uniq</strong></p>
<pre>sort -m Prefix_aa.sort Prefix_ab.sort |uniq</pre>
<p><strong>上面的代码排序后还是不满意的话，可尝试下面的（2010-07-12更新）：</strong></p>
<pre>sort -sm Prefix_aa.sort Prefix_ab.sort|uniq</pre>
<p>这是好早前碰到的一个问题了。没记错的话应该是这么回事。~</p>
<p>sort 与 uniq 命令还有许多有用的参数，如sort -m、uniq -u、uniq -d等。sort 与 uniq的组合是很强大的。</p>
<p>~完。</p>
<div class="similarity"><h3>有点相关的文章</h3><ul><li><a href="http://liucheng.name/1160/">两种方法查看文件的行数</a> (1.000)</li><li><a href="http://liucheng.name/203/">AWK详细参考(转载整理)</a> (0.500)</li><li><a href="http://liucheng.name/206/">使用linux中的sed编辑器</a> (0.500)</li><li><a href="http://liucheng.name/209/">IBM文章:Shell、Shell 脚本编写、命令行、相关工具及技巧</a> (0.500)</li><li><a href="http://liucheng.name/350/">Linux：FTP全部命令 使用方法介绍</a> (0.500)</li><li><a href="http://liucheng.name/578/">linux:crontab命令用法</a> (RANDOM - 0.500)</li></ul></div><!-- Mix --><table class="wumii-related-items" cellspacing="0" cellpadding="3" border="0"  style="clear: both;">
    
    <tr>
        <td colspan="5"><b><font size="-1"  style="display: block !important; padding: 20px 0 5px !important;">无觅猜您也喜欢：</font></b></td>
    </tr>
    
        <tr>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important;">
                    <a target="_blank" title="linux:awk中的NR,FNR" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F579%2F&from=http%3A%2F%2Fliucheng.name%2F1055%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/images/blogWidget/wordpress_default.gif" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">linux:awk中的NR,FNR</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="BioPerl安装指南:Unix/Linux/Windows下的安装" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F765%2F&from=http%3A%2F%2Fliucheng.name%2F1055%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/site_images/2010/12/10/1214174.png" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">BioPerl安装指南:Unix/Linux/Windows下的安装</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="linux:crontab命令用法" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F578%2F&from=http%3A%2F%2Fliucheng.name%2F1055%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/images/blogWidget/wordpress_default.gif" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">linux:crontab命令用法</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="Linux：FTP全部命令 使用方法介绍" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F350%2F&from=http%3A%2F%2Fliucheng.name%2F1055%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/images/blogWidget/wordpress_default.gif" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">Linux：FTP全部命令 使用方法介绍</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="使用linux中的sed编辑器" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F206%2F&from=http%3A%2F%2Fliucheng.name%2F1055%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/images/blogWidget/wordpress_default.gif" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">使用linux中的sed编辑器</font>
                    </a>
                </td>
        </tr>
    
    <tr>
        <td colspan="5" align="right">
            <a style="text-decoration: none !important;" href="http://www.wumii.com/widget/relatedItems.htm" target="_blank" title="无觅相关文章插件">
                <font size="-1" color="#bbbbbb" style="display: block !important; font-family: arial !important; padding: 5px 0 !important; font-size: 12px !important; color: #bbb !important;">无觅</font>
            </a>
        </td>
    </tr>
</table>]]></content:encoded>
			<wfw:commentRss>http://liucheng.name/1055/feed/</wfw:commentRss>
		<slash:comments>13</slash:comments>
		</item>
		<item>
		<title>两种办法批量去掉重复数据</title>
		<link>http://liucheng.name/935/</link>
		<comments>http://liucheng.name/935/#comments</comments>
		<pubDate>Wed, 02 Sep 2009 12:46:16 +0000</pubDate>
		<dc:creator>柳城</dc:creator>
				<category><![CDATA[Biocompute]]></category>
		<category><![CDATA[数据分析]]></category>
		<category><![CDATA[重复数据]]></category>

		<guid isPermaLink="false">http://www.liucheng.name/?p=935</guid>
		<description><![CDATA[这是对大批量数据而言的，数据少的话其实没多大意思。我喜欢大批量的数据。对于大批量的数据，最好用的工具其实还是Linux，如果是稍微少一些的话，直接用Excel就行了。这里介绍两种办法，去掉重复的数据。……<table class="wumii-related-items" cellspacing="0" cellpadding="3" border="0"  style="clear: both;">
    
    <tr>
        <td colspan="5"><b><font size="-1"  style="display: block !important; padding: 20px 0 5px !important;">无觅猜您也喜欢：</font></b></td>
    </tr>
    
        <tr>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important;">
                    <a target="_blank" title="【生物信息学教程】7.3：大规模基因表达谱数据分析方法" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F520%2F&from=http%3A%2F%2Fliucheng.name%2F935%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/images/blogWidget/wordpress_default.gif" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">【生物信息学教程】7.3：大规模基因表达谱数据分析方法</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="Entrez所有数据库的最新数据统计" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F429%2F&from=http%3A%2F%2Fliucheng.name%2F935%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/site_images/2010/12/10/1214796.png" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">Entrez所有数据库的最新数据统计</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="Bioperl:对序列数据进行操作" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F398%2F&from=http%3A%2F%2Fliucheng.name%2F935%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/images/blogWidget/wordpress_default.gif" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">Bioperl:对序列数据进行操作</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="RefSeq数据库最新版的统计数据" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F413%2F&from=http%3A%2F%2Fliucheng.name%2F935%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/images/blogWidget/wordpress_default.gif" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">RefSeq数据库最新版的统计数据</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="系统进化树构建及数据分析的简介" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F577%2F&from=http%3A%2F%2Fliucheng.name%2F935%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/images/blogWidget/wordpress_default.gif" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">系统进化树构建及数据分析的简介</font>
                    </a>
                </td>
        </tr>
    
    <tr>
        <td colspan="5" align="right">
            <a style="text-decoration: none !important;" href="http://www.wumii.com/widget/relatedItems.htm" target="_blank" title="无觅相关文章插件">
                <font size="-1" color="#bbbbbb" style="display: block !important; font-family: arial !important; padding: 5px 0 !important; font-size: 12px !important; color: #bbb !important;">无觅</font>
            </a>
        </td>
    </tr>
</table>]]></description>
			<content:encoded><![CDATA[<p>这是对大批量数据而言的，数据少的话其实没多大意思。我喜欢大批量的数据。对于大批量的数据，最好用的工具其实还是<span class='wp_keywordlink_affiliate'><a href="http://liucheng.name/tag/linux/" title="Linux" target="_blank">Linux</a></span>，如果是稍微少一些的话，直接用Excel就行了。</p>
<p>这里介绍两种办法，去掉重复的数据。说之前来复习一下我喜欢的一句话：<span class='wp_keywordlink'><a href="http://liucheng.name/" title="柳城博客" target="_blank">柳城博客</a></span>(Lc.)， <strong>努力在数据的海洋里畅游</strong>。</p>
<h3>1，用Excel，适合不算太大量的数据</h3>
<p>如果是用Excel，太大的数据打开会有问题的。打开十几M的大小的Excel都够吃力的。如果电脑内存差些，那更加惨。不过，这种情况是适合大部分人的。<span id="more-935"></span></p>
<p style="text-align: center;"><a href="http://liucheng.name/wp-content/uploads/2009/09/Deduplication.png" class="highslide-image" onclick="return hs.expand(this);"><img class="aligncenter size-full wp-image-937" title="两种办法批量去掉重复数据" src="http://liucheng.name/wp-content/uploads/2009/09/Deduplication.png" alt="两种办法批量去掉重复数据" width="288" height="190" /></a></p>
<p style="text-align: center;"><a href="http://liucheng.name/wp-content/uploads/2009/09/Deduplication2.png" class="highslide-image" onclick="return hs.expand(this);"><img class="aligncenter size-full wp-image-936" title="两种办法批量去掉重复数据" src="http://liucheng.name/wp-content/uploads/2009/09/Deduplication2.png" alt="两种办法批量去掉重复数据" width="277" height="234" /></a></p>
<p> </p>
<h3>2，用Linux，sort与uniq命令</h3>
<p>假设数据放在一个文件，取名file.txt。</p>
<pre>cat file.txt | sort | uniq &gt;newfile.txt</pre>
<p>这样就是去掉<span class='wp_keywordlink_affiliate'><a href="http://liucheng.name/tag/%e9%87%8d%e5%a4%8d%e6%95%b0%e6%8d%ae/" title="查看 重复数据 中的全部文章" target="_blank">重复数据</a></span>，并输出到一个新的文件newfile.txt</p>
<p>简单吧。</p>
<div class="similarity"><h3>有点相关的文章</h3><ul><li><a href="http://liucheng.name/520/">【生物信息学教程】7.3：大规模基因表达谱数据分析方法</a> (0.500)</li><li><a href="http://liucheng.name/1055/">Linux下大文件的排序和去重复</a> (0.454)</li><li><a href="http://liucheng.name/1160/">两种方法查看文件的行数</a> (0.454)</li><li><a href="http://liucheng.name/1205/">Perl处理Fasta序列的又一实例</a> (0.454)</li><li><a href="http://liucheng.name/1236/">用Perl抓取网页和提交表格</a> (0.454)</li><li><a href="http://liucheng.name/1218/">Perl split函数</a> (RANDOM - 0.047)</li></ul></div><!-- Mix --><table class="wumii-related-items" cellspacing="0" cellpadding="3" border="0"  style="clear: both;">
    
    <tr>
        <td colspan="5"><b><font size="-1"  style="display: block !important; padding: 20px 0 5px !important;">无觅猜您也喜欢：</font></b></td>
    </tr>
    
        <tr>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important;">
                    <a target="_blank" title="【生物信息学教程】7.3：大规模基因表达谱数据分析方法" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F520%2F&from=http%3A%2F%2Fliucheng.name%2F935%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/images/blogWidget/wordpress_default.gif" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">【生物信息学教程】7.3：大规模基因表达谱数据分析方法</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="Entrez所有数据库的最新数据统计" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F429%2F&from=http%3A%2F%2Fliucheng.name%2F935%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/site_images/2010/12/10/1214796.png" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">Entrez所有数据库的最新数据统计</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="Bioperl:对序列数据进行操作" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F398%2F&from=http%3A%2F%2Fliucheng.name%2F935%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/images/blogWidget/wordpress_default.gif" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">Bioperl:对序列数据进行操作</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="RefSeq数据库最新版的统计数据" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F413%2F&from=http%3A%2F%2Fliucheng.name%2F935%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/images/blogWidget/wordpress_default.gif" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">RefSeq数据库最新版的统计数据</font>
                    </a>
                </td>
                <td width="102" valign="top" style="padding: 5px !important; margin: 0 !important; border-left: 1px solid #DDDDDD !important;">
                    <a target="_blank" title="系统进化树构建及数据分析的简介" style="text-decoration: none !important; cursor: pointer !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fliucheng.name%2F577%2F&from=http%3A%2F%2Fliucheng.name%2F935%2F">
                        <img style="margin: 0 !important; padding: 2px !important; border: 1px solid #DDDDDD !important; width: 96px !important; height: 96px !important;" src="http://static.wumii.com/images/blogWidget/wordpress_default.gif" width="96px" height="96px" /><br />
                        <font size="-1" color="#333333" style="display: block !important; line-height: 15px !important; width: 102px !important; font: 12px/15px arial !important; height: 60px !important; margin: 3px 0 0 0 !important; padding: 0 !important; overflow: hidden !important;">系统进化树构建及数据分析的简介</font>
                    </a>
                </td>
        </tr>
    
    <tr>
        <td colspan="5" align="right">
            <a style="text-decoration: none !important;" href="http://www.wumii.com/widget/relatedItems.htm" target="_blank" title="无觅相关文章插件">
                <font size="-1" color="#bbbbbb" style="display: block !important; font-family: arial !important; padding: 5px 0 !important; font-size: 12px !important; color: #bbb !important;">无觅</font>
            </a>
        </td>
    </tr>
</table>]]></content:encoded>
			<wfw:commentRss>http://liucheng.name/935/feed/</wfw:commentRss>
		<slash:comments>5</slash:comments>
		</item>
	</channel>
</rss>

