SlideShare une entreprise Scribd logo
1  sur  42
Télécharger pour lire hors ligne
Step 1: TF모델을 TRT 포맷으로 변환
Step 2: 모델 Parser 생성
Step 3: 입/출력 레이어 정보 입력
Step 4: 모델의 최적화 및
런타임 Engine 생성
Step 5: 엔진을 파일로 저장
Step 6: 엔진을 파일에서 읽음
Step 7: Inference 수행
•
•
•
•
•
•
•
•
•
PReLUPlugin::PReLUPlugin(const Weights *weights, int nbWeights) {
mWeights = weights[0];
mWeights.values = malloc(mWeights.count * type2size(mWeights.type));
memcpy(const_cast<void *>(mWeights.values), weights[0].values, mWeights.count * type2size(mWeights.type));
}
int PReLUPlugin::enqueue(int batchSize, const void *const *inputs, void **outputs, void *workspace,
cudaStream_t stream) {
const float zerof{0.0f}; const __half zeroh = fp16::__float2half(0.0f);
if (mWeights.type == DataType::__float) {
CHECK(Forward_gpu<__float>(batchSize * mNbInputCount, mNbInputChannels,
mNbInputHeight * mNbInputHeight, reinterpret_cast<const __float *>(mDeviceKernel),
reinterpret_cast<const __float *>(inputs[0]), reinterpret_cast<__float *>(outputs[0]),
zerof, mChannelShared ? mNbInputChannels : 1, stream));
} else { // DataType::kFLOAT }
return 0;
}
template <typename Ftype>
__global__ void PReLUForward(const int n, const int channels, const int dim, const Ftype* slope_data, const
Ftype* in, Ftype* out, const Ftype zero, const int div_factor) {
CUDA_KERNEL_LOOP(index, n) {
int c = (index / dim) % channels / div_factor;
out[index] = (in[index] > (Ftype(zero))) ? in[index] :
in[index] * *(reinterpret_cast<const Ftype*>(slope_data)+c);
}
}
template <typename Ftype>
cudaError_t Forward_gpu(const int count, const int channels, const int dim, const Ftype* mDeviceKernel,
const Ftype* bottom_data, Ftype* top_data, const Ftype zero, const int div_factor, const cudaStream_t stream) {
PReLUForward<<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS, 0, stream>>>
(count, channels, dim, mDeviceKernel, bottom_data, top_data, zero, div_factor);
return cudaGetLastError();
}
IPluginExt *PReLUPlugin::clone() const override {
return new PReLUPlugin(&mWeights, 1);
}
IPlugin* pluginFactory::createPlugin(const char* layerName, const Weights* serialData, int nbWeights) override {
return new PReLUPlugin(serialData, serialLength);
}
PluginFactory parserPluginFactory;
parser->setPluginFactoryExt(&parserPluginFactory);
const IBlobNameToTensor *blobNameToTensor =
parser->parse(gParams.deployFile.c_str(), // caffe deploy file
gParams.modelFile.c_str(), // caffe model file
*network, // network definition that the parser will populate
gParams.fp16 ? DataType::kHALF : DataType::kFLOAT);
builder->setMaxBatchSize(gParams.batchSize);
builder->setMaxWorkspaceSize(size_t(gParams.workspaceSize) << 20);
builder->setFp16Mode(gParams.fp16);
ICudaEngine* engine = builder->buildCudaEngine(*network);
void PReLUPlugin::serialize(void *buffer) {
char *d = static_cast<char *>(buffer), *a = d;
write(d, mNbInputChannels); write(d, mNbInputHeight); write(d, mNbInputWidth); write(d, mNbInputCount);
write(d, mChannelShared); write(d, mWeights.count); write(d, mWeights.type);
convertAndCopyToBuffer(d, mWeights);
assert(d == a + getSerializationSize());
}
PReLUPlugin::PReLUPlugin(const void *data, size_t length) {
const char *d = static_cast<const char *>(data), *a = d;
read<int>(d, mNbInputChannels); read<int>(d, mNbInputHeight); read<int>(d, mNbInputWidth);
read<int>(d, mNbInputCount); read<bool>(d, mChannelShared); read<int64_t>(d, mWeights.count);
read<DataType>(d, mWeights.type);
mWeights.values = malloc(mWeights.count * type2size(mWeights.type));
memcpy(const_cast<void *>(mWeights.values), d, mWeights.count * type2size(mWeights.type));
deserializeToDevice(d, mDeviceKernel, mWeights.count * type2size(mWeights.type));
assert(d == a + length);
}
Iplugin *PluginFactory::createPlugin(const char *layerName, const void *serialData, size_t serialLength) override
{
return new PReLUPlugin(serialData, serialLength);
}
PluginFactory pluginFactory;
engine = infer->deserializeCudaEngine(trt_plan_file, size, &pluginFactory);
cudaMemcpyAsync(buffers[inputIndex], input, batchSize * INPUT_SIZE * sizeof(float),
cudaMemcpyHostToDevice, stream);
context->enqueue(gParams.batchSize, &buffers[0], stream, nullptr);
cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float),
cudaMemcpyDeviceToHost, stream);
cudaStreamSynchronize(stream);
cudaStreamCreate(&stream));
IExecutionContext* context = engine->createExecutionContext();
[232] TensorRT를 활용한 딥러닝 Inference 최적화
[232] TensorRT를 활용한 딥러닝 Inference 최적화
[232] TensorRT를 활용한 딥러닝 Inference 최적화
[232] TensorRT를 활용한 딥러닝 Inference 최적화
[232] TensorRT를 활용한 딥러닝 Inference 최적화
[232] TensorRT를 활용한 딥러닝 Inference 최적화

Contenu connexe

Tendances

組み込みでこそC++を使う10の理由
組み込みでこそC++を使う10の理由組み込みでこそC++を使う10の理由
組み込みでこそC++を使う10の理由kikairoya
 
Docker入門 - 基礎編 いまから始めるDocker管理
Docker入門 - 基礎編 いまから始めるDocker管理Docker入門 - 基礎編 いまから始めるDocker管理
Docker入門 - 基礎編 いまから始めるDocker管理Masahito Zembutsu
 
何となく勉強した気分になれるパーサ入門
何となく勉強した気分になれるパーサ入門何となく勉強した気分になれるパーサ入門
何となく勉強した気分になれるパーサ入門masayoshi takahashi
 
中3女子が狂える本当に気持ちのいい constexpr
中3女子が狂える本当に気持ちのいい constexpr中3女子が狂える本当に気持ちのいい constexpr
中3女子が狂える本当に気持ちのいい constexprGenya Murakami
 
C/C++プログラマのための開発ツール
C/C++プログラマのための開発ツールC/C++プログラマのための開発ツール
C/C++プログラマのための開発ツールMITSUNARI Shigeo
 
tcpdumpとtcpreplayとtcprewriteと他。
tcpdumpとtcpreplayとtcprewriteと他。tcpdumpとtcpreplayとtcprewriteと他。
tcpdumpとtcpreplayとtcprewriteと他。(^-^) togakushi
 
Xbyakの紹介とその周辺
Xbyakの紹介とその周辺Xbyakの紹介とその周辺
Xbyakの紹介とその周辺MITSUNARI Shigeo
 
A Sneak Peek of MLIR in TensorFlow
A Sneak Peek of MLIR in TensorFlowA Sneak Peek of MLIR in TensorFlow
A Sneak Peek of MLIR in TensorFlowKoan-Sin Tan
 
Docker入門-基礎編 いまから始めるDocker管理【2nd Edition】
Docker入門-基礎編 いまから始めるDocker管理【2nd Edition】Docker入門-基礎編 いまから始めるDocker管理【2nd Edition】
Docker入門-基礎編 いまから始めるDocker管理【2nd Edition】Masahito Zembutsu
 
BoostAsioで可読性を求めるのは間違っているだろうか
BoostAsioで可読性を求めるのは間違っているだろうかBoostAsioで可読性を求めるのは間違っているだろうか
BoostAsioで可読性を求めるのは間違っているだろうかYuki Miyatake
 
Streaming replication in practice
Streaming replication in practiceStreaming replication in practice
Streaming replication in practiceAlexey Lesovsky
 
組み込み関数(intrinsic)によるSIMD入門
組み込み関数(intrinsic)によるSIMD入門組み込み関数(intrinsic)によるSIMD入門
組み込み関数(intrinsic)によるSIMD入門Norishige Fukushima
 
TensorFlow XLAの可能性
TensorFlow XLAの可能性 TensorFlow XLAの可能性
TensorFlow XLAの可能性 Mr. Vengineer
 
Effective Modern C++ 勉強会#3 Item16
Effective Modern C++ 勉強会#3 Item16Effective Modern C++ 勉強会#3 Item16
Effective Modern C++ 勉強会#3 Item16Mitsuru Kariya
 
Intro to SVE 富岳のA64FXを触ってみた
Intro to SVE 富岳のA64FXを触ってみたIntro to SVE 富岳のA64FXを触ってみた
Intro to SVE 富岳のA64FXを触ってみたMITSUNARI Shigeo
 
Yocto bspを作ってみた
Yocto bspを作ってみたYocto bspを作ってみた
Yocto bspを作ってみたwata2ki
 

Tendances (20)

組み込みでこそC++を使う10の理由
組み込みでこそC++を使う10の理由組み込みでこそC++を使う10の理由
組み込みでこそC++を使う10の理由
 
Docker入門 - 基礎編 いまから始めるDocker管理
Docker入門 - 基礎編 いまから始めるDocker管理Docker入門 - 基礎編 いまから始めるDocker管理
Docker入門 - 基礎編 いまから始めるDocker管理
 
何となく勉強した気分になれるパーサ入門
何となく勉強した気分になれるパーサ入門何となく勉強した気分になれるパーサ入門
何となく勉強した気分になれるパーサ入門
 
Doubly Linked List
Doubly Linked ListDoubly Linked List
Doubly Linked List
 
中3女子が狂える本当に気持ちのいい constexpr
中3女子が狂える本当に気持ちのいい constexpr中3女子が狂える本当に気持ちのいい constexpr
中3女子が狂える本当に気持ちのいい constexpr
 
C/C++プログラマのための開発ツール
C/C++プログラマのための開発ツールC/C++プログラマのための開発ツール
C/C++プログラマのための開発ツール
 
Map
MapMap
Map
 
tcpdumpとtcpreplayとtcprewriteと他。
tcpdumpとtcpreplayとtcprewriteと他。tcpdumpとtcpreplayとtcprewriteと他。
tcpdumpとtcpreplayとtcprewriteと他。
 
Xbyakの紹介とその周辺
Xbyakの紹介とその周辺Xbyakの紹介とその周辺
Xbyakの紹介とその周辺
 
A Sneak Peek of MLIR in TensorFlow
A Sneak Peek of MLIR in TensorFlowA Sneak Peek of MLIR in TensorFlow
A Sneak Peek of MLIR in TensorFlow
 
Docker入門-基礎編 いまから始めるDocker管理【2nd Edition】
Docker入門-基礎編 いまから始めるDocker管理【2nd Edition】Docker入門-基礎編 いまから始めるDocker管理【2nd Edition】
Docker入門-基礎編 いまから始めるDocker管理【2nd Edition】
 
Vespa 機能紹介 #yjmu
Vespa 機能紹介 #yjmuVespa 機能紹介 #yjmu
Vespa 機能紹介 #yjmu
 
BoostAsioで可読性を求めるのは間違っているだろうか
BoostAsioで可読性を求めるのは間違っているだろうかBoostAsioで可読性を求めるのは間違っているだろうか
BoostAsioで可読性を求めるのは間違っているだろうか
 
Streaming replication in practice
Streaming replication in practiceStreaming replication in practice
Streaming replication in practice
 
組み込み関数(intrinsic)によるSIMD入門
組み込み関数(intrinsic)によるSIMD入門組み込み関数(intrinsic)によるSIMD入門
組み込み関数(intrinsic)によるSIMD入門
 
HTTP/2, QUIC入門
HTTP/2, QUIC入門HTTP/2, QUIC入門
HTTP/2, QUIC入門
 
TensorFlow XLAの可能性
TensorFlow XLAの可能性 TensorFlow XLAの可能性
TensorFlow XLAの可能性
 
Effective Modern C++ 勉強会#3 Item16
Effective Modern C++ 勉強会#3 Item16Effective Modern C++ 勉強会#3 Item16
Effective Modern C++ 勉強会#3 Item16
 
Intro to SVE 富岳のA64FXを触ってみた
Intro to SVE 富岳のA64FXを触ってみたIntro to SVE 富岳のA64FXを触ってみた
Intro to SVE 富岳のA64FXを触ってみた
 
Yocto bspを作ってみた
Yocto bspを作ってみたYocto bspを作ってみた
Yocto bspを作ってみた
 

Similaire à [232] TensorRT를 활용한 딥러닝 Inference 최적화

服务框架: Thrift & PasteScript
服务框架: Thrift & PasteScript服务框架: Thrift & PasteScript
服务框架: Thrift & PasteScriptQiangning Hong
 
In Class AssignmetzCST280W13a-1.pdfCST 280 In-Class Pract.docx
In Class AssignmetzCST280W13a-1.pdfCST 280 In-Class Pract.docxIn Class AssignmetzCST280W13a-1.pdfCST 280 In-Class Pract.docx
In Class AssignmetzCST280W13a-1.pdfCST 280 In-Class Pract.docxbradburgess22840
 
Protocol handler in Gecko
Protocol handler in GeckoProtocol handler in Gecko
Protocol handler in GeckoChih-Hsuan Kuo
 
Kapacitor - Real Time Data Processing Engine
Kapacitor - Real Time Data Processing EngineKapacitor - Real Time Data Processing Engine
Kapacitor - Real Time Data Processing EnginePrashant Vats
 
Instruction1. Please read the two articles. (Kincheloe part 1 &.docx
Instruction1. Please read the two articles. (Kincheloe part 1 &.docxInstruction1. Please read the two articles. (Kincheloe part 1 &.docx
Instruction1. Please read the two articles. (Kincheloe part 1 &.docxcarliotwaycave
 
Php 5.6 From the Inside Out
Php 5.6 From the Inside OutPhp 5.6 From the Inside Out
Php 5.6 From the Inside OutFerenc Kovács
 
33rd Degree 2013, Bad Tests, Good Tests
33rd Degree 2013, Bad Tests, Good Tests33rd Degree 2013, Bad Tests, Good Tests
33rd Degree 2013, Bad Tests, Good TestsTomek Kaczanowski
 
如何透過 Go-kit 快速搭建微服務架構應用程式實戰
如何透過 Go-kit 快速搭建微服務架構應用程式實戰如何透過 Go-kit 快速搭建微服務架構應用程式實戰
如何透過 Go-kit 快速搭建微服務架構應用程式實戰KAI CHU CHUNG
 
How Many Ohs? (An Integration Guide to Apex & Triple-o)
How Many Ohs? (An Integration Guide to Apex & Triple-o)How Many Ohs? (An Integration Guide to Apex & Triple-o)
How Many Ohs? (An Integration Guide to Apex & Triple-o)OPNFV
 
How Does Kubernetes Build OpenAPI Specifications?
How Does Kubernetes Build OpenAPI Specifications?How Does Kubernetes Build OpenAPI Specifications?
How Does Kubernetes Build OpenAPI Specifications?reallavalamp
 
The Ring programming language version 1.5.3 book - Part 8 of 184
The Ring programming language version 1.5.3 book - Part 8 of 184The Ring programming language version 1.5.3 book - Part 8 of 184
The Ring programming language version 1.5.3 book - Part 8 of 184Mahmoud Samir Fayed
 
VPN Access Runbook
VPN Access RunbookVPN Access Runbook
VPN Access RunbookTaha Shakeel
 
NodeJs
NodeJsNodeJs
NodeJsdizabl
 
The Ring programming language version 1.5.4 book - Part 8 of 185
The Ring programming language version 1.5.4 book - Part 8 of 185The Ring programming language version 1.5.4 book - Part 8 of 185
The Ring programming language version 1.5.4 book - Part 8 of 185Mahmoud Samir Fayed
 
2012 JDays Bad Tests Good Tests
2012 JDays Bad Tests Good Tests2012 JDays Bad Tests Good Tests
2012 JDays Bad Tests Good TestsTomek Kaczanowski
 
MT_01_unittest_python.pdf
MT_01_unittest_python.pdfMT_01_unittest_python.pdf
MT_01_unittest_python.pdfHans Jones
 
Where the wild things are - Benchmarking and Micro-Optimisations
Where the wild things are - Benchmarking and Micro-OptimisationsWhere the wild things are - Benchmarking and Micro-Optimisations
Where the wild things are - Benchmarking and Micro-OptimisationsMatt Warren
 
Create a JAVA program that performs file IO and database interaction.pdf
Create a JAVA program that performs file IO and database interaction.pdfCreate a JAVA program that performs file IO and database interaction.pdf
Create a JAVA program that performs file IO and database interaction.pdfmalavshah9013
 

Similaire à [232] TensorRT를 활용한 딥러닝 Inference 최적화 (20)

服务框架: Thrift & PasteScript
服务框架: Thrift & PasteScript服务框架: Thrift & PasteScript
服务框架: Thrift & PasteScript
 
In Class AssignmetzCST280W13a-1.pdfCST 280 In-Class Pract.docx
In Class AssignmetzCST280W13a-1.pdfCST 280 In-Class Pract.docxIn Class AssignmetzCST280W13a-1.pdfCST 280 In-Class Pract.docx
In Class AssignmetzCST280W13a-1.pdfCST 280 In-Class Pract.docx
 
Protocol handler in Gecko
Protocol handler in GeckoProtocol handler in Gecko
Protocol handler in Gecko
 
Kapacitor - Real Time Data Processing Engine
Kapacitor - Real Time Data Processing EngineKapacitor - Real Time Data Processing Engine
Kapacitor - Real Time Data Processing Engine
 
Instruction1. Please read the two articles. (Kincheloe part 1 &.docx
Instruction1. Please read the two articles. (Kincheloe part 1 &.docxInstruction1. Please read the two articles. (Kincheloe part 1 &.docx
Instruction1. Please read the two articles. (Kincheloe part 1 &.docx
 
Php 5.6 From the Inside Out
Php 5.6 From the Inside OutPhp 5.6 From the Inside Out
Php 5.6 From the Inside Out
 
33rd Degree 2013, Bad Tests, Good Tests
33rd Degree 2013, Bad Tests, Good Tests33rd Degree 2013, Bad Tests, Good Tests
33rd Degree 2013, Bad Tests, Good Tests
 
如何透過 Go-kit 快速搭建微服務架構應用程式實戰
如何透過 Go-kit 快速搭建微服務架構應用程式實戰如何透過 Go-kit 快速搭建微服務架構應用程式實戰
如何透過 Go-kit 快速搭建微服務架構應用程式實戰
 
How Many Ohs? (An Integration Guide to Apex & Triple-o)
How Many Ohs? (An Integration Guide to Apex & Triple-o)How Many Ohs? (An Integration Guide to Apex & Triple-o)
How Many Ohs? (An Integration Guide to Apex & Triple-o)
 
Microkernel Development
Microkernel DevelopmentMicrokernel Development
Microkernel Development
 
How Does Kubernetes Build OpenAPI Specifications?
How Does Kubernetes Build OpenAPI Specifications?How Does Kubernetes Build OpenAPI Specifications?
How Does Kubernetes Build OpenAPI Specifications?
 
The Ring programming language version 1.5.3 book - Part 8 of 184
The Ring programming language version 1.5.3 book - Part 8 of 184The Ring programming language version 1.5.3 book - Part 8 of 184
The Ring programming language version 1.5.3 book - Part 8 of 184
 
VPN Access Runbook
VPN Access RunbookVPN Access Runbook
VPN Access Runbook
 
NodeJs
NodeJsNodeJs
NodeJs
 
The Ring programming language version 1.5.4 book - Part 8 of 185
The Ring programming language version 1.5.4 book - Part 8 of 185The Ring programming language version 1.5.4 book - Part 8 of 185
The Ring programming language version 1.5.4 book - Part 8 of 185
 
2012 JDays Bad Tests Good Tests
2012 JDays Bad Tests Good Tests2012 JDays Bad Tests Good Tests
2012 JDays Bad Tests Good Tests
 
MT_01_unittest_python.pdf
MT_01_unittest_python.pdfMT_01_unittest_python.pdf
MT_01_unittest_python.pdf
 
Where the wild things are - Benchmarking and Micro-Optimisations
Where the wild things are - Benchmarking and Micro-OptimisationsWhere the wild things are - Benchmarking and Micro-Optimisations
Where the wild things are - Benchmarking and Micro-Optimisations
 
Keras and TensorFlow
Keras and TensorFlowKeras and TensorFlow
Keras and TensorFlow
 
Create a JAVA program that performs file IO and database interaction.pdf
Create a JAVA program that performs file IO and database interaction.pdfCreate a JAVA program that performs file IO and database interaction.pdf
Create a JAVA program that performs file IO and database interaction.pdf
 

Plus de NAVER D2

[211] 인공지능이 인공지능 챗봇을 만든다
[211] 인공지능이 인공지능 챗봇을 만든다[211] 인공지능이 인공지능 챗봇을 만든다
[211] 인공지능이 인공지능 챗봇을 만든다NAVER D2
 
[233] 대형 컨테이너 클러스터에서의 고가용성 Network Load Balancing: Maglev Hashing Scheduler i...
[233] 대형 컨테이너 클러스터에서의 고가용성 Network Load Balancing: Maglev Hashing Scheduler i...[233] 대형 컨테이너 클러스터에서의 고가용성 Network Load Balancing: Maglev Hashing Scheduler i...
[233] 대형 컨테이너 클러스터에서의 고가용성 Network Load Balancing: Maglev Hashing Scheduler i...NAVER D2
 
[215] Druid로 쉽고 빠르게 데이터 분석하기
[215] Druid로 쉽고 빠르게 데이터 분석하기[215] Druid로 쉽고 빠르게 데이터 분석하기
[215] Druid로 쉽고 빠르게 데이터 분석하기NAVER D2
 
[245]Papago Internals: 모델분석과 응용기술 개발
[245]Papago Internals: 모델분석과 응용기술 개발[245]Papago Internals: 모델분석과 응용기술 개발
[245]Papago Internals: 모델분석과 응용기술 개발NAVER D2
 
[236] 스트림 저장소 최적화 이야기: 아파치 드루이드로부터 얻은 교훈
[236] 스트림 저장소 최적화 이야기: 아파치 드루이드로부터 얻은 교훈[236] 스트림 저장소 최적화 이야기: 아파치 드루이드로부터 얻은 교훈
[236] 스트림 저장소 최적화 이야기: 아파치 드루이드로부터 얻은 교훈NAVER D2
 
[235]Wikipedia-scale Q&A
[235]Wikipedia-scale Q&A[235]Wikipedia-scale Q&A
[235]Wikipedia-scale Q&ANAVER D2
 
[244]로봇이 현실 세계에 대해 학습하도록 만들기
[244]로봇이 현실 세계에 대해 학습하도록 만들기[244]로봇이 현실 세계에 대해 학습하도록 만들기
[244]로봇이 현실 세계에 대해 학습하도록 만들기NAVER D2
 
[243] Deep Learning to help student’s Deep Learning
[243] Deep Learning to help student’s Deep Learning[243] Deep Learning to help student’s Deep Learning
[243] Deep Learning to help student’s Deep LearningNAVER D2
 
[234]Fast & Accurate Data Annotation Pipeline for AI applications
[234]Fast & Accurate Data Annotation Pipeline for AI applications[234]Fast & Accurate Data Annotation Pipeline for AI applications
[234]Fast & Accurate Data Annotation Pipeline for AI applicationsNAVER D2
 
Old version: [233]대형 컨테이너 클러스터에서의 고가용성 Network Load Balancing
Old version: [233]대형 컨테이너 클러스터에서의 고가용성 Network Load BalancingOld version: [233]대형 컨테이너 클러스터에서의 고가용성 Network Load Balancing
Old version: [233]대형 컨테이너 클러스터에서의 고가용성 Network Load BalancingNAVER D2
 
[226]NAVER 광고 deep click prediction: 모델링부터 서빙까지
[226]NAVER 광고 deep click prediction: 모델링부터 서빙까지[226]NAVER 광고 deep click prediction: 모델링부터 서빙까지
[226]NAVER 광고 deep click prediction: 모델링부터 서빙까지NAVER D2
 
[225]NSML: 머신러닝 플랫폼 서비스하기 & 모델 튜닝 자동화하기
[225]NSML: 머신러닝 플랫폼 서비스하기 & 모델 튜닝 자동화하기[225]NSML: 머신러닝 플랫폼 서비스하기 & 모델 튜닝 자동화하기
[225]NSML: 머신러닝 플랫폼 서비스하기 & 모델 튜닝 자동화하기NAVER D2
 
[224]네이버 검색과 개인화
[224]네이버 검색과 개인화[224]네이버 검색과 개인화
[224]네이버 검색과 개인화NAVER D2
 
[216]Search Reliability Engineering (부제: 지진에도 흔들리지 않는 네이버 검색시스템)
[216]Search Reliability Engineering (부제: 지진에도 흔들리지 않는 네이버 검색시스템)[216]Search Reliability Engineering (부제: 지진에도 흔들리지 않는 네이버 검색시스템)
[216]Search Reliability Engineering (부제: 지진에도 흔들리지 않는 네이버 검색시스템)NAVER D2
 
[214] Ai Serving Platform: 하루 수 억 건의 인퍼런스를 처리하기 위한 고군분투기
[214] Ai Serving Platform: 하루 수 억 건의 인퍼런스를 처리하기 위한 고군분투기[214] Ai Serving Platform: 하루 수 억 건의 인퍼런스를 처리하기 위한 고군분투기
[214] Ai Serving Platform: 하루 수 억 건의 인퍼런스를 처리하기 위한 고군분투기NAVER D2
 
[213] Fashion Visual Search
[213] Fashion Visual Search[213] Fashion Visual Search
[213] Fashion Visual SearchNAVER D2
 
[242]컴퓨터 비전을 이용한 실내 지도 자동 업데이트 방법: 딥러닝을 통한 POI 변화 탐지
[242]컴퓨터 비전을 이용한 실내 지도 자동 업데이트 방법: 딥러닝을 통한 POI 변화 탐지[242]컴퓨터 비전을 이용한 실내 지도 자동 업데이트 방법: 딥러닝을 통한 POI 변화 탐지
[242]컴퓨터 비전을 이용한 실내 지도 자동 업데이트 방법: 딥러닝을 통한 POI 변화 탐지NAVER D2
 
[212]C3, 데이터 처리에서 서빙까지 가능한 하둡 클러스터
[212]C3, 데이터 처리에서 서빙까지 가능한 하둡 클러스터[212]C3, 데이터 처리에서 서빙까지 가능한 하둡 클러스터
[212]C3, 데이터 처리에서 서빙까지 가능한 하둡 클러스터NAVER D2
 
[223]기계독해 QA: 검색인가, NLP인가?
[223]기계독해 QA: 검색인가, NLP인가?[223]기계독해 QA: 검색인가, NLP인가?
[223]기계독해 QA: 검색인가, NLP인가?NAVER D2
 
[231] Clova 화자인식
[231] Clova 화자인식[231] Clova 화자인식
[231] Clova 화자인식NAVER D2
 

Plus de NAVER D2 (20)

[211] 인공지능이 인공지능 챗봇을 만든다
[211] 인공지능이 인공지능 챗봇을 만든다[211] 인공지능이 인공지능 챗봇을 만든다
[211] 인공지능이 인공지능 챗봇을 만든다
 
[233] 대형 컨테이너 클러스터에서의 고가용성 Network Load Balancing: Maglev Hashing Scheduler i...
[233] 대형 컨테이너 클러스터에서의 고가용성 Network Load Balancing: Maglev Hashing Scheduler i...[233] 대형 컨테이너 클러스터에서의 고가용성 Network Load Balancing: Maglev Hashing Scheduler i...
[233] 대형 컨테이너 클러스터에서의 고가용성 Network Load Balancing: Maglev Hashing Scheduler i...
 
[215] Druid로 쉽고 빠르게 데이터 분석하기
[215] Druid로 쉽고 빠르게 데이터 분석하기[215] Druid로 쉽고 빠르게 데이터 분석하기
[215] Druid로 쉽고 빠르게 데이터 분석하기
 
[245]Papago Internals: 모델분석과 응용기술 개발
[245]Papago Internals: 모델분석과 응용기술 개발[245]Papago Internals: 모델분석과 응용기술 개발
[245]Papago Internals: 모델분석과 응용기술 개발
 
[236] 스트림 저장소 최적화 이야기: 아파치 드루이드로부터 얻은 교훈
[236] 스트림 저장소 최적화 이야기: 아파치 드루이드로부터 얻은 교훈[236] 스트림 저장소 최적화 이야기: 아파치 드루이드로부터 얻은 교훈
[236] 스트림 저장소 최적화 이야기: 아파치 드루이드로부터 얻은 교훈
 
[235]Wikipedia-scale Q&A
[235]Wikipedia-scale Q&A[235]Wikipedia-scale Q&A
[235]Wikipedia-scale Q&A
 
[244]로봇이 현실 세계에 대해 학습하도록 만들기
[244]로봇이 현실 세계에 대해 학습하도록 만들기[244]로봇이 현실 세계에 대해 학습하도록 만들기
[244]로봇이 현실 세계에 대해 학습하도록 만들기
 
[243] Deep Learning to help student’s Deep Learning
[243] Deep Learning to help student’s Deep Learning[243] Deep Learning to help student’s Deep Learning
[243] Deep Learning to help student’s Deep Learning
 
[234]Fast & Accurate Data Annotation Pipeline for AI applications
[234]Fast & Accurate Data Annotation Pipeline for AI applications[234]Fast & Accurate Data Annotation Pipeline for AI applications
[234]Fast & Accurate Data Annotation Pipeline for AI applications
 
Old version: [233]대형 컨테이너 클러스터에서의 고가용성 Network Load Balancing
Old version: [233]대형 컨테이너 클러스터에서의 고가용성 Network Load BalancingOld version: [233]대형 컨테이너 클러스터에서의 고가용성 Network Load Balancing
Old version: [233]대형 컨테이너 클러스터에서의 고가용성 Network Load Balancing
 
[226]NAVER 광고 deep click prediction: 모델링부터 서빙까지
[226]NAVER 광고 deep click prediction: 모델링부터 서빙까지[226]NAVER 광고 deep click prediction: 모델링부터 서빙까지
[226]NAVER 광고 deep click prediction: 모델링부터 서빙까지
 
[225]NSML: 머신러닝 플랫폼 서비스하기 & 모델 튜닝 자동화하기
[225]NSML: 머신러닝 플랫폼 서비스하기 & 모델 튜닝 자동화하기[225]NSML: 머신러닝 플랫폼 서비스하기 & 모델 튜닝 자동화하기
[225]NSML: 머신러닝 플랫폼 서비스하기 & 모델 튜닝 자동화하기
 
[224]네이버 검색과 개인화
[224]네이버 검색과 개인화[224]네이버 검색과 개인화
[224]네이버 검색과 개인화
 
[216]Search Reliability Engineering (부제: 지진에도 흔들리지 않는 네이버 검색시스템)
[216]Search Reliability Engineering (부제: 지진에도 흔들리지 않는 네이버 검색시스템)[216]Search Reliability Engineering (부제: 지진에도 흔들리지 않는 네이버 검색시스템)
[216]Search Reliability Engineering (부제: 지진에도 흔들리지 않는 네이버 검색시스템)
 
[214] Ai Serving Platform: 하루 수 억 건의 인퍼런스를 처리하기 위한 고군분투기
[214] Ai Serving Platform: 하루 수 억 건의 인퍼런스를 처리하기 위한 고군분투기[214] Ai Serving Platform: 하루 수 억 건의 인퍼런스를 처리하기 위한 고군분투기
[214] Ai Serving Platform: 하루 수 억 건의 인퍼런스를 처리하기 위한 고군분투기
 
[213] Fashion Visual Search
[213] Fashion Visual Search[213] Fashion Visual Search
[213] Fashion Visual Search
 
[242]컴퓨터 비전을 이용한 실내 지도 자동 업데이트 방법: 딥러닝을 통한 POI 변화 탐지
[242]컴퓨터 비전을 이용한 실내 지도 자동 업데이트 방법: 딥러닝을 통한 POI 변화 탐지[242]컴퓨터 비전을 이용한 실내 지도 자동 업데이트 방법: 딥러닝을 통한 POI 변화 탐지
[242]컴퓨터 비전을 이용한 실내 지도 자동 업데이트 방법: 딥러닝을 통한 POI 변화 탐지
 
[212]C3, 데이터 처리에서 서빙까지 가능한 하둡 클러스터
[212]C3, 데이터 처리에서 서빙까지 가능한 하둡 클러스터[212]C3, 데이터 처리에서 서빙까지 가능한 하둡 클러스터
[212]C3, 데이터 처리에서 서빙까지 가능한 하둡 클러스터
 
[223]기계독해 QA: 검색인가, NLP인가?
[223]기계독해 QA: 검색인가, NLP인가?[223]기계독해 QA: 검색인가, NLP인가?
[223]기계독해 QA: 검색인가, NLP인가?
 
[231] Clova 화자인식
[231] Clova 화자인식[231] Clova 화자인식
[231] Clova 화자인식
 

Dernier

Bun (KitWorks Team Study 노별마루 발표 2024.4.22)
Bun (KitWorks Team Study 노별마루 발표 2024.4.22)Bun (KitWorks Team Study 노별마루 발표 2024.4.22)
Bun (KitWorks Team Study 노별마루 발표 2024.4.22)Wonjun Hwang
 
WordPress Websites for Engineers: Elevate Your Brand
WordPress Websites for Engineers: Elevate Your BrandWordPress Websites for Engineers: Elevate Your Brand
WordPress Websites for Engineers: Elevate Your Brandgvaughan
 
SAP Build Work Zone - Overview L2-L3.pptx
SAP Build Work Zone - Overview L2-L3.pptxSAP Build Work Zone - Overview L2-L3.pptx
SAP Build Work Zone - Overview L2-L3.pptxNavinnSomaal
 
"LLMs for Python Engineers: Advanced Data Analysis and Semantic Kernel",Oleks...
"LLMs for Python Engineers: Advanced Data Analysis and Semantic Kernel",Oleks..."LLMs for Python Engineers: Advanced Data Analysis and Semantic Kernel",Oleks...
"LLMs for Python Engineers: Advanced Data Analysis and Semantic Kernel",Oleks...Fwdays
 
Kotlin Multiplatform & Compose Multiplatform - Starter kit for pragmatics
Kotlin Multiplatform & Compose Multiplatform - Starter kit for pragmaticsKotlin Multiplatform & Compose Multiplatform - Starter kit for pragmatics
Kotlin Multiplatform & Compose Multiplatform - Starter kit for pragmaticscarlostorres15106
 
CloudStudio User manual (basic edition):
CloudStudio User manual (basic edition):CloudStudio User manual (basic edition):
CloudStudio User manual (basic edition):comworks
 
DevoxxFR 2024 Reproducible Builds with Apache Maven
DevoxxFR 2024 Reproducible Builds with Apache MavenDevoxxFR 2024 Reproducible Builds with Apache Maven
DevoxxFR 2024 Reproducible Builds with Apache MavenHervé Boutemy
 
AI as an Interface for Commercial Buildings
AI as an Interface for Commercial BuildingsAI as an Interface for Commercial Buildings
AI as an Interface for Commercial BuildingsMemoori
 
New from BookNet Canada for 2024: BNC CataList - Tech Forum 2024
New from BookNet Canada for 2024: BNC CataList - Tech Forum 2024New from BookNet Canada for 2024: BNC CataList - Tech Forum 2024
New from BookNet Canada for 2024: BNC CataList - Tech Forum 2024BookNet Canada
 
Story boards and shot lists for my a level piece
Story boards and shot lists for my a level pieceStory boards and shot lists for my a level piece
Story boards and shot lists for my a level piececharlottematthew16
 
Unraveling Multimodality with Large Language Models.pdf
Unraveling Multimodality with Large Language Models.pdfUnraveling Multimodality with Large Language Models.pdf
Unraveling Multimodality with Large Language Models.pdfAlex Barbosa Coqueiro
 
Search Engine Optimization SEO PDF for 2024.pdf
Search Engine Optimization SEO PDF for 2024.pdfSearch Engine Optimization SEO PDF for 2024.pdf
Search Engine Optimization SEO PDF for 2024.pdfRankYa
 
Integration and Automation in Practice: CI/CD in Mule Integration and Automat...
Integration and Automation in Practice: CI/CD in Mule Integration and Automat...Integration and Automation in Practice: CI/CD in Mule Integration and Automat...
Integration and Automation in Practice: CI/CD in Mule Integration and Automat...Patryk Bandurski
 
Are Multi-Cloud and Serverless Good or Bad?
Are Multi-Cloud and Serverless Good or Bad?Are Multi-Cloud and Serverless Good or Bad?
Are Multi-Cloud and Serverless Good or Bad?Mattias Andersson
 
Install Stable Diffusion in windows machine
Install Stable Diffusion in windows machineInstall Stable Diffusion in windows machine
Install Stable Diffusion in windows machinePadma Pradeep
 
Streamlining Python Development: A Guide to a Modern Project Setup
Streamlining Python Development: A Guide to a Modern Project SetupStreamlining Python Development: A Guide to a Modern Project Setup
Streamlining Python Development: A Guide to a Modern Project SetupFlorian Wilhelm
 
The Future of Software Development - Devin AI Innovative Approach.pdf
The Future of Software Development - Devin AI Innovative Approach.pdfThe Future of Software Development - Devin AI Innovative Approach.pdf
The Future of Software Development - Devin AI Innovative Approach.pdfSeasiaInfotech2
 
Human Factors of XR: Using Human Factors to Design XR Systems
Human Factors of XR: Using Human Factors to Design XR SystemsHuman Factors of XR: Using Human Factors to Design XR Systems
Human Factors of XR: Using Human Factors to Design XR SystemsMark Billinghurst
 
"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek Schlawack
"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek Schlawack"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek Schlawack
"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek SchlawackFwdays
 
Powerpoint exploring the locations used in television show Time Clash
Powerpoint exploring the locations used in television show Time ClashPowerpoint exploring the locations used in television show Time Clash
Powerpoint exploring the locations used in television show Time Clashcharlottematthew16
 

Dernier (20)

Bun (KitWorks Team Study 노별마루 발표 2024.4.22)
Bun (KitWorks Team Study 노별마루 발표 2024.4.22)Bun (KitWorks Team Study 노별마루 발표 2024.4.22)
Bun (KitWorks Team Study 노별마루 발표 2024.4.22)
 
WordPress Websites for Engineers: Elevate Your Brand
WordPress Websites for Engineers: Elevate Your BrandWordPress Websites for Engineers: Elevate Your Brand
WordPress Websites for Engineers: Elevate Your Brand
 
SAP Build Work Zone - Overview L2-L3.pptx
SAP Build Work Zone - Overview L2-L3.pptxSAP Build Work Zone - Overview L2-L3.pptx
SAP Build Work Zone - Overview L2-L3.pptx
 
"LLMs for Python Engineers: Advanced Data Analysis and Semantic Kernel",Oleks...
"LLMs for Python Engineers: Advanced Data Analysis and Semantic Kernel",Oleks..."LLMs for Python Engineers: Advanced Data Analysis and Semantic Kernel",Oleks...
"LLMs for Python Engineers: Advanced Data Analysis and Semantic Kernel",Oleks...
 
Kotlin Multiplatform & Compose Multiplatform - Starter kit for pragmatics
Kotlin Multiplatform & Compose Multiplatform - Starter kit for pragmaticsKotlin Multiplatform & Compose Multiplatform - Starter kit for pragmatics
Kotlin Multiplatform & Compose Multiplatform - Starter kit for pragmatics
 
CloudStudio User manual (basic edition):
CloudStudio User manual (basic edition):CloudStudio User manual (basic edition):
CloudStudio User manual (basic edition):
 
DevoxxFR 2024 Reproducible Builds with Apache Maven
DevoxxFR 2024 Reproducible Builds with Apache MavenDevoxxFR 2024 Reproducible Builds with Apache Maven
DevoxxFR 2024 Reproducible Builds with Apache Maven
 
AI as an Interface for Commercial Buildings
AI as an Interface for Commercial BuildingsAI as an Interface for Commercial Buildings
AI as an Interface for Commercial Buildings
 
New from BookNet Canada for 2024: BNC CataList - Tech Forum 2024
New from BookNet Canada for 2024: BNC CataList - Tech Forum 2024New from BookNet Canada for 2024: BNC CataList - Tech Forum 2024
New from BookNet Canada for 2024: BNC CataList - Tech Forum 2024
 
Story boards and shot lists for my a level piece
Story boards and shot lists for my a level pieceStory boards and shot lists for my a level piece
Story boards and shot lists for my a level piece
 
Unraveling Multimodality with Large Language Models.pdf
Unraveling Multimodality with Large Language Models.pdfUnraveling Multimodality with Large Language Models.pdf
Unraveling Multimodality with Large Language Models.pdf
 
Search Engine Optimization SEO PDF for 2024.pdf
Search Engine Optimization SEO PDF for 2024.pdfSearch Engine Optimization SEO PDF for 2024.pdf
Search Engine Optimization SEO PDF for 2024.pdf
 
Integration and Automation in Practice: CI/CD in Mule Integration and Automat...
Integration and Automation in Practice: CI/CD in Mule Integration and Automat...Integration and Automation in Practice: CI/CD in Mule Integration and Automat...
Integration and Automation in Practice: CI/CD in Mule Integration and Automat...
 
Are Multi-Cloud and Serverless Good or Bad?
Are Multi-Cloud and Serverless Good or Bad?Are Multi-Cloud and Serverless Good or Bad?
Are Multi-Cloud and Serverless Good or Bad?
 
Install Stable Diffusion in windows machine
Install Stable Diffusion in windows machineInstall Stable Diffusion in windows machine
Install Stable Diffusion in windows machine
 
Streamlining Python Development: A Guide to a Modern Project Setup
Streamlining Python Development: A Guide to a Modern Project SetupStreamlining Python Development: A Guide to a Modern Project Setup
Streamlining Python Development: A Guide to a Modern Project Setup
 
The Future of Software Development - Devin AI Innovative Approach.pdf
The Future of Software Development - Devin AI Innovative Approach.pdfThe Future of Software Development - Devin AI Innovative Approach.pdf
The Future of Software Development - Devin AI Innovative Approach.pdf
 
Human Factors of XR: Using Human Factors to Design XR Systems
Human Factors of XR: Using Human Factors to Design XR SystemsHuman Factors of XR: Using Human Factors to Design XR Systems
Human Factors of XR: Using Human Factors to Design XR Systems
 
"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek Schlawack
"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek Schlawack"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek Schlawack
"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek Schlawack
 
Powerpoint exploring the locations used in television show Time Clash
Powerpoint exploring the locations used in television show Time ClashPowerpoint exploring the locations used in television show Time Clash
Powerpoint exploring the locations used in television show Time Clash
 

[232] TensorRT를 활용한 딥러닝 Inference 최적화

  • 1.
  • 2.
  • 3.
  • 4.
  • 5.
  • 6.
  • 7.
  • 8.
  • 9.
  • 10.
  • 11.
  • 12.
  • 13.
  • 14. Step 1: TF모델을 TRT 포맷으로 변환 Step 2: 모델 Parser 생성 Step 3: 입/출력 레이어 정보 입력 Step 4: 모델의 최적화 및 런타임 Engine 생성 Step 5: 엔진을 파일로 저장 Step 6: 엔진을 파일에서 읽음 Step 7: Inference 수행
  • 15.
  • 16.
  • 17.
  • 18.
  • 19.
  • 20.
  • 21.
  • 22.
  • 24. • • • PReLUPlugin::PReLUPlugin(const Weights *weights, int nbWeights) { mWeights = weights[0]; mWeights.values = malloc(mWeights.count * type2size(mWeights.type)); memcpy(const_cast<void *>(mWeights.values), weights[0].values, mWeights.count * type2size(mWeights.type)); }
  • 25. int PReLUPlugin::enqueue(int batchSize, const void *const *inputs, void **outputs, void *workspace, cudaStream_t stream) { const float zerof{0.0f}; const __half zeroh = fp16::__float2half(0.0f); if (mWeights.type == DataType::__float) { CHECK(Forward_gpu<__float>(batchSize * mNbInputCount, mNbInputChannels, mNbInputHeight * mNbInputHeight, reinterpret_cast<const __float *>(mDeviceKernel), reinterpret_cast<const __float *>(inputs[0]), reinterpret_cast<__float *>(outputs[0]), zerof, mChannelShared ? mNbInputChannels : 1, stream)); } else { // DataType::kFLOAT } return 0; }
  • 26. template <typename Ftype> __global__ void PReLUForward(const int n, const int channels, const int dim, const Ftype* slope_data, const Ftype* in, Ftype* out, const Ftype zero, const int div_factor) { CUDA_KERNEL_LOOP(index, n) { int c = (index / dim) % channels / div_factor; out[index] = (in[index] > (Ftype(zero))) ? in[index] : in[index] * *(reinterpret_cast<const Ftype*>(slope_data)+c); } }
  • 27. template <typename Ftype> cudaError_t Forward_gpu(const int count, const int channels, const int dim, const Ftype* mDeviceKernel, const Ftype* bottom_data, Ftype* top_data, const Ftype zero, const int div_factor, const cudaStream_t stream) { PReLUForward<<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS, 0, stream>>> (count, channels, dim, mDeviceKernel, bottom_data, top_data, zero, div_factor); return cudaGetLastError(); }
  • 28.
  • 29. IPluginExt *PReLUPlugin::clone() const override { return new PReLUPlugin(&mWeights, 1); } IPlugin* pluginFactory::createPlugin(const char* layerName, const Weights* serialData, int nbWeights) override { return new PReLUPlugin(serialData, serialLength); }
  • 30. PluginFactory parserPluginFactory; parser->setPluginFactoryExt(&parserPluginFactory); const IBlobNameToTensor *blobNameToTensor = parser->parse(gParams.deployFile.c_str(), // caffe deploy file gParams.modelFile.c_str(), // caffe model file *network, // network definition that the parser will populate gParams.fp16 ? DataType::kHALF : DataType::kFLOAT);
  • 32. void PReLUPlugin::serialize(void *buffer) { char *d = static_cast<char *>(buffer), *a = d; write(d, mNbInputChannels); write(d, mNbInputHeight); write(d, mNbInputWidth); write(d, mNbInputCount); write(d, mChannelShared); write(d, mWeights.count); write(d, mWeights.type); convertAndCopyToBuffer(d, mWeights); assert(d == a + getSerializationSize()); }
  • 33. PReLUPlugin::PReLUPlugin(const void *data, size_t length) { const char *d = static_cast<const char *>(data), *a = d; read<int>(d, mNbInputChannels); read<int>(d, mNbInputHeight); read<int>(d, mNbInputWidth); read<int>(d, mNbInputCount); read<bool>(d, mChannelShared); read<int64_t>(d, mWeights.count); read<DataType>(d, mWeights.type); mWeights.values = malloc(mWeights.count * type2size(mWeights.type)); memcpy(const_cast<void *>(mWeights.values), d, mWeights.count * type2size(mWeights.type)); deserializeToDevice(d, mDeviceKernel, mWeights.count * type2size(mWeights.type)); assert(d == a + length); }
  • 34. Iplugin *PluginFactory::createPlugin(const char *layerName, const void *serialData, size_t serialLength) override { return new PReLUPlugin(serialData, serialLength); }
  • 35. PluginFactory pluginFactory; engine = infer->deserializeCudaEngine(trt_plan_file, size, &pluginFactory);
  • 36. cudaMemcpyAsync(buffers[inputIndex], input, batchSize * INPUT_SIZE * sizeof(float), cudaMemcpyHostToDevice, stream); context->enqueue(gParams.batchSize, &buffers[0], stream, nullptr); cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream); cudaStreamSynchronize(stream); cudaStreamCreate(&stream)); IExecutionContext* context = engine->createExecutionContext();