複数のファイルから必要なキーを取り出してCSVファイルに書き出す処理時間を短縮するには
複数のファイルの中からそれぞれ必要なキーを抽出して1つのcsvファイルにリストを書き出す処理をしています。
- 元のデータは1ファイル辺り約800行(40kB程度)が約18万件、トータルで約8GBほど
- 必要なデータを取り出した結果ファイルは10MB程度になる
動作の確認等は出来たのですが処理に時間がかかりすぎていて、先輩がPerlで作ったものだと約240秒ほどで処理が完了するのですが、私がC#で書いたコードだと処理が完了するのに約1時間も時間がかかってしまいます。まだC#を勉強し始めて間もないですがコードの中に無駄な処理があるのか変なループがあるのかが分かりません。
この処理を最短に時間短縮出来るような書き方があれば教えていただきたいです。
宜しくお願い致します。
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
namespace ConsoleApp6
{
class DatRowValues
{
public string ProcessData { get; set; }
public string KeyValue { get; set; }
public string IntValue { get; set; }
public string StringValue { get; set; }
public string Value { get; set; }
}
class NewDatRowValues
{
public string EqpId { get; set; }
public string LotId { get; set; }
public string WaferId { get; set; }
public DateTime SDate { get; set; }
public string TempBRINECoolant { get; set; }
}
class NewDatRowValuesMapper : CsvHelper.Configuration.ClassMap<NewDatRowValues>
{
public NewDatRowValuesMapper()
{
Map(x => x.EqpId).Index(0);
Map(x => x.LotId).Index(1);
Map(x => x.WaferId).Index(2);
Map(x => x.SDate).Index(3).TypeConverterOption.Format("yyyy/MM/dd HH:mm:ss");
Map(x => x.TempBRINECoolant).Index(4);
}
}
class Program
{
static void Main(string[] args)
{
// 書き出し用の入れ物を用意
var writeDatList = new List<NewDatRowValues>();
// 読み込み
foreach (string fileName in Directory.GetFiles(@"C:\20190403", "*.dat"))
using (var sr = new StreamReader(fileName, System.Text.Encoding.GetEncoding("shift_jis")))
using (var inputDat = new CsvHelper.CsvReader(sr))
{
inputDat.Configuration.HasHeaderRecord = false;
// 必要なキーとなる行のみ抽出
var dat = inputDat.GetRecords<DatRowValues>();
var targetRows = dat.Where(r =>
r.KeyValue == "EQP_ID" ||
r.KeyValue == "LOT_ID" ||
r.KeyValue == "WAFER_ID" ||
r.KeyValue == "S_DATE" ||
r.KeyValue == "TempBRINECoolant");
// それぞれの値を格納
var newRow = new NewDatRowValues();
foreach (var row in targetRows)
{
if (row.KeyValue == "EQP_ID")
{
newRow.EqpId = row.StringValue;
}
if (row.KeyValue == "LOT_ID")
{
newRow.LotId = row.StringValue;
}
if (row.KeyValue == "WAFER_ID")
{
newRow.WaferId = row.StringValue;
}
if (row.KeyValue == "S_DATE")
{
newRow.SDate = DateTime.Parse(row.StringValue);
}
if (row.KeyValue == "TempBRINECoolant")
{
newRow.TempBRINECoolant = row.StringValue;
}
}
writeDatList.Add(newRow);
}
// 書き出し
using (var sw = new StreamWriter(@"C:\テスト\list1.csv"))
using (var outputDat = new CsvHelper.CsvWriter(sw))
{
var writingList = writeDatList.GroupBy(r => r.EqpId.Substring(0, 4))
.Where(g => g.Count() > 1)
.SelectMany(g => g)
.ToList();
outputDat.Configuration.HasHeaderRecord = false;
outputDat.Configuration.RegisterClassMap<NewDatRowValuesMapper>();
outputDat.WriteRecords(writingList);
}
}
}
}
元データ (1ファイルの中身)
\\,AACZ12501_93G25701901,93G257019-18,TSN.PR,TSN-LCT,AACZ12501,2019/04/04 00:00:31,実処理データ
ProcessData,LOT_ID,3,AP0077130.00C,
ProcessData,LOT_ID_SUB,3,AP0077130.00,
ProcessData,LOT_NO,3,AP0077130,
ProcessData,WAFER_ID,3,AP0077130.18,
ProcessData,WAFER_NO,1,18,
ProcessData,PRODSPEC_ID,3,T6BD60001-00001.00,
ProcessData,PRODGRP_ID,3,T6BD6,
ProcessData,PRODGRP_BIND,3,T6BD6,
ProcessData,MAIN_MAINPD_ID,3,A5L501PC.00,
ProcessData,MAINPD_ID,3,A5L501PC.00,
ProcessData,FLOW_TYPE,3,Main,
ProcessData,FLOW_TYPE_NO,1,1,
ProcessData,D_SEQNO,1,89,
ProcessData,OP_NO,3,TSN CT Coat.MA1,
ProcessData,OP_NO_NAME,3,本処理,
ProcessData,PD_IDENT,3,KTSNIMA1.00,
ProcessData,PD_IDENT_NAME,3,COAT,
ProcessData,EQP_GROUP_CODE,3,PCOT,
ProcessData,EQP_GROUP_NAME,3,RESIST C/T,
ProcessData,EQP_GROUP_BIND,3,PCOT,
ProcessData,EQP_ID,3,PCOT003,
ProcessData,PH_RECIPE_ID,3,084,
ProcessData,RCP_NAME_SPACE,3,PEPPR,
ProcessData,LC_RECIPE_ID,3,V146G-420-10+AQ7.00,
ProcessData,RECIPE_ID,3,PEPPR.084,
ProcessData,S_DATE,4,2019/04/03 23:48:08,
ProcessData,E_DATE,4,2019/04/04 00:00:31,
ProcessData,CAST_ID,3,PA0-01239,
ProcessData,SLOT_NO,1,18,
ProcessData,DEPT_CODE,3,DEPT,
ProcessData,HIST_S_DATE_1,4,2019/04/03 23:48:08,
ProcessData,HIST_E_DATE_1,4,2019/04/04 00:00:31,
ProcessData,Clock_C,3,2019040400003155,
ProcessData,EventName_C,3,STS At Destination,
ProcessData,SubstID_C,3,AP0077130.18,
ProcessData,ProcessJobID_C,3,AP0077130.01,
ProcessData,PPID_C,3,RegFlowRcpClass/084,
ProcessData,ControlJobID_C,3,PCOT003-20190403-0053,
ProcessData,WaferSequenceNo_C,3,18,
ProcessData,SubstProcState_C,3,2,
ProcessData,_TCT-02_Cup temp.,2,23.07,
ProcessData,_TCT-02_Cup humidity,2,45.26,
ProcessData,_TCT-02_Resist temp.,2,23.00,
ProcessData,_TCT-02_Motor flange temp.,2,23.00,
ProcessData,_TCT-02_Solvent bath flow,2,0.0,
ProcessData,_TCT-02_Back rinse flow 1,2,0.0,
ProcessData,_TCT-02_Back rinse flow 2,2,0.0,
ProcessData,_TCT-02_Drain case Rinse,2,0.0,
ProcessData,_TCT-02_Back rinse1+2 flow,2,64.8,
ProcessData,_TCT-02_Side rinse flow,2,4.8,
ProcessData,_TCT-02_Cup wind velocity,2,0.38,
ProcessData,_COT-02_Cup temp.,2,23.08,
ProcessData,_COT-02_Cup humidity,2,45.31,
ProcessData,_COT-02_Resist temp.,2,23.02,
ProcessData,_COT-02_Motor flange temp.,2,23.00,
ProcessData,_COT-02_Solvent bath flow,2,0.0,
ProcessData,_COT-02_RRC Nozzle flow,2,2.5,
ProcessData,_COT-02_Drain case Rinse,2,0.0,
ProcessData,_COT-02_Back rinse1+2 flow,2,81.7,
ProcessData,_COT-02_Side rinse flow,2,21.4,
ProcessData,_COT-02_Cup wind velocity,2,0.40,
ProcessData,_ADH-02_Plate temp.,2,100.03,
ProcessData,_ADH-02_HMDS flow,2,5553.9,
ProcessData,_CPL-03_Plate temp.,2,24.00,
ProcessData,_CPL-05_Plate temp.,2,23.00,
ProcessData,_PHP-03_Plate temp.,2,109.99,
ProcessData,_PHP-03_Plate temp. 1,2,109.98,
ProcessData,_PHP-03_Plate temp. 2,2,110.02,
ProcessData,_PHP-03_Plate temp. 3,2,110.02,
ProcessData,_PHP-03_Plate temp. 4,2,110.01,
ProcessData,_PHP-03_Plate temp. 5,2,109.99,
ProcessData,_PHP-03_Plate temp. 6,2,109.98,
ProcessData,_PHP-03_Plate temp. 7,2,109.99,
ProcessData,SlotStatus_C,3,3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 2 2 2 2 2 2 0 0 0 0 0,
ProcessData,PortID_C,3,1,
ProcessData,LotID_C,3,AP0077130.00C,
ProcessData,SubstDestination_C,3,PA0-01239.18,
ProcessData,SubstLocID1_C,3,PA0-01239.18,
ProcessData,Timein1_C,3,2019040323394290,
ProcessData,Timeout1_C,3,2019040323480827,
ProcessData,SubstLocID2_C,3,[2-05]TRS01,
ProcessData,Timein2_C,3,2019040323481433,
ProcessData,Timeout2_C,3,2019040323482607,
ProcessData,SubstLocID3_C,3,[2-15]ADH02,
ProcessData,Timein3_C,3,2019040323482957,
ProcessData,Timeout3_C,3,2019040323493000,
ProcessData,SubstLocID4_C,3,[2-21]CPL05,
ProcessData,Timein4_C,3,2019040323493567,
ProcessData,Timeout4_C,3,2019040323503524,
ProcessData,SubstLocID5_C,3,[2-02]COT02,
ProcessData,Timein5_C,3,2019040323504071,
ProcessData,Timeout5_C,3,2019040323514101,
ProcessData,SubstLocID6_C,3,[2-24]PHP03,
ProcessData,Timein6_C,3,2019040323514584,
ProcessData,Timeout6_C,3,2019040323533013,
ProcessData,SubstLocID7_C,3,[2-17]CPL03,
ProcessData,Timein7_C,3,2019040323533561,
ProcessData,Timeout7_C,3,2019040323563468,
ProcessData,SubstLocID8_C,3,[2-04]TCT02,
ProcessData,Timein8_C,3,2019040323564008,
ProcessData,Timeout8_C,3,2019040400001710,
ProcessData,SubstLocID9_C,3,[2-06]TRS02,
ProcessData,Timein9_C,3,2019040400002061,
ProcessData,Timeout9_C,3,2019040400002521,
ProcessData,SubstMtrlStatus_C,3,0,
ProcessData,SubstSource_C,3,PA0-01239.18,
ProcessData,SubstState_C,3,2,
ProcessData,SubstType_C,3,0,
ProcessData,SubstUsage_C,3,0,
ProcessData,CLOCK1_C,3,2019040323480827,
ProcessData,CLOCK2_C,3,2019040400003153,
出力後のcsvファイルの中身(一部)は以下のようになっています。
PCOT003 AP0077130.00C AP0077130.18 2019/4/3 23:48
PCOT004 AP0077164.00C AP0077164.16 2019/4/3 23:49
PCOT004 AP0077164.00C AP0077164.17 2019/4/3 23:50
PCOT008 AP0076967.00C AP0076967.01 2019/4/3 23:56
PCOT001 SP0008774.00C SP0008774.02 2019/4/3 23:50
PCOT002 SP0009131.00C SP0009131.03 2019/4/3 23:53
PCOT002 SP0009131.00C SP0009131.02 2019/4/3 23:53
PCOT001 SP0008774.00C SP0008774.03 2019/4/3 23:50
PCOT008 AP0076967.00C AP0076967.03 2019/4/3 23:56
PCOT008 AP0076967.00C AP0076967.02 2019/4/3 23:56
PCOT004 AP0077164.00C AP0077164.18 2019/4/3 23:50
PCOT002 SP0009131.00C SP0009131.04 2019/4/3 23:54
PCOT008 AP0076967.00C AP0076967.04 2019/4/3 23:56
PCOT004 AP0077164.00C AP0077164.20 2019/4/3 23:52
PCOT004 AP0077164.00C AP0077164.19 2019/4/3 23:51
PCOT003 AP0077130.00C AP0077130.19 2019/4/3 23:48
PCOT002 SP0009131.00C SP0009131.06 2019/4/3 23:55
PCOT002 SP0009131.00C SP0009131.05 2019/4/3 23:54
PCOT001 SP0008774.00C SP0008774.05 2019/4/3 23:50
1ファイルから1行出力しています。
以下Perlのコードを頂きましたので参考程度に載せておきます。
use strict;
use warnings;
my $dirname = 'C:\Users\0020316094\Desktop\Perl\development_1\data';
my @list;
my $count = 0;
my @Lot;
my @Waf;
my @Eqp;
my @Date;
my @lot_id;
my @waf_id;
my @eqp_id;
my @date_id;
my $start_time = time;
opendir(DIR, $dirname) or die "$dirname: $!";
while (my $dir = readdir (DIR)) {
next if $dir eq '.' || $dir eq '..' || $dir eq 'test.txt' || $dir eq 'file_get.pl'; #.を除外する処理
push @list, $dir; #配列に入れる
}
closedir (DIR);
foreach (@list) {
# open(FILE, "<", "C:\Users\0020316094\Desktop\Perl\development_1\data\$list[$count]") or die "$!";
open(FILE, "<", "$list[$count]") or die "$!"; #修正前
while (my $line = <FILE>) {
@lot_id = split(/,/, $line) if $line =~ /,LOT_ID,/;
$Lot[$count] = $lot_id[3];
@waf_id = split(/,/, $line) if $line =~ /,WAFER_ID,/;
$Waf[$count] = $waf_id[3];
@eqp_id = split(/,/, $line) if $line =~ /,EQP_ID,/;
$Eqp[$count] = $eqp_id[3];
@date_id = split(/,/, $line) if $line =~ /,S_DATE,/;
$Date[$count] = $date_id[3];
}
close (FILE);
$count++
}
my $count1 = 0;
open(TXT, ">>test.txt") or die "$!";
foreach (@Lot) {
print TXT "$Lot[$count1],$Waf[$count1],$Eqp[$count1],$Date[$count1]\n";
$count1++
}
close (TXT);