모델 컨버팅

증강 내용 복습

# transform: 전처리 변형

# augmentation: 늘리기

# 1) online augmentation : 증강한거를 메모리 상에서만 존재 (데이터 조회가 빠름 다만 RAM부담)

# 2) offline augmentation : 증강한거를 하드에 저장 (빅데이터에 대해서 좋지만 조회가 느림)

import albumentations as A
import matplotlib.pyplot as plt
from PIL import Image

# Load the sample image from disk as an array.
image = plt.imread('/content/training_set/dogs/dog.1.jpg')

# An albumentations preprocessing pipeline is assembled with Compose.
transform = A.Compose(
    [
        A.Resize(height=256, width=256),
        A.Blur(p=0.5),
        A.HorizontalFlip(p=1),
    ]
)

# Calling the pipeline returns its result as a dictionary ...
transformed = transform(image=image)

# ... so the augmented image has to be fetched by key.
transformed_image = transformed['image']

# The result is an array; convert it to a Pillow image so it can be saved as a JPEG.
im = Image.fromarray(transformed_image)
im.save("test.jpg")

# Display the augmented image with the axis ticks hidden.
plt.imshow(transformed_image)
plt.xticks([])
plt.yticks([])
plt.show()

모델 최적화와 컨버트

추론 가속화 : Torch-TensorRT

# Convert the model with Torch-TensorRT
print("Converting to Torch-TensorRT...")
model.eval()  # switch to evaluation mode before compiling
trt_model = torch_tensorrt.compile(
    model,
    inputs=[
        torch_tensorrt.Input(
            # Smallest input shape the compiled engine must accept (batch of 1).
            min_shape=[1, 1, 28, 28],
            # Shape the engine is tuned for.
            opt_shape=[batch_size, 1, 28, 28],
            # Largest input shape the engine must accept.
            max_shape=[batch_size, 1, 28, 28],
            # Input data type.
            dtype=torch.float32,
        ),
    ],
    # Keep 32-bit float kernels to preserve accuracy (fp16 etc. risk precision loss).
    enabled_precisions={torch.float32},
    # 2**30 bytes = 1 GiB of GPU workspace for the engine build.
    workspace_size=2 ** 30,
)

경량화(프루닝)

학습된 모델의 Weight 중 중요도가 낮은 일부를 제거하여 사용하지 않는 것이다.
(주의: Drop Out은 모델 학습시, 랜덤하게 노드를 끄는 방법이다.)

1) Unstructured Pruning: 개별 weight를 0으로 만들어(마스킹) 사용하지 않는다. 텐서의 크기와 구조는 그대로 유지된다.

2) Structured Pruning: 채널·필터 등 구조 단위로 weight를 통째로 제거한다.

# Pruning helper
def apply_pruning(model, prune_amount):
    """Apply L1 unstructured pruning to the conv1 and conv2 weights of ``model``.

    Args:
        model: Trained module exposing ``conv1`` and ``conv2`` submodules.
        prune_amount: Pruning amount, forwarded unchanged to
            ``prune.l1_unstructured`` as ``amount``.

    Prints one line per pruned layer; returns nothing.
    """
    # Only the two convolution layers are pruned, in this order.
    conv_layers = [model.conv1, model.conv2]
    for module in conv_layers:
        param = 'weight'
        prune.l1_unstructured(module, name=param, amount=prune_amount)
        print(f'Pruned {param} in {module}')

# Pruning-mask inspection helper
def print_pruning_masks(model):
    """Print kept/total weight counts for every pruned Conv2d layer in ``model``.

    Walks all submodules via ``named_modules()`` and, for each ``nn.Conv2d``
    that carries a ``weight_mask`` attribute, prints
    ``<name>.weight_mask: <sum of mask> / <number of mask elements>``.
    Layers of other types, or Conv2d layers without a mask, are skipped.
    """
    pruned_convs = (
        (name, module)
        for name, module in model.named_modules()
        if isinstance(module, nn.Conv2d) and hasattr(module, 'weight_mask')
    )
    for name, module in pruned_convs:
        # sum() counts the mask entries equal to 1 (weights still in use);
        # numel() is the total number of entries in the mask.
        print(f'{name}.weight_mask: {module.weight_mask.sum().item()} / {module.weight_mask.numel()}')

# Full training run: call train() once per epoch, with epochs numbered from 1.
for epoch in range(1, epochs + 1):
    train(model, device, train_loader, optimizer, criterion, epoch)

# Baseline evaluation of the trained model, before any pruning.
print("=== Initial Evaluation ===")
evaluate(model, device, test_loader)

# Apply pruning.
print("\n=== Applying Pruning ===")
apply_pruning(model, prune_amount) # key step of this section

# Inspect the pruning masks.
print("\n=== Pruning Masks ===")
print_pruning_masks(model)

# Evaluate again after pruning, for comparison with the baseline.
print("\n=== Evaluation After Pruning ===")
evaluate(model, device, test_loader)

# Recorded result: conv1 kept 202 of its 288 weights.
# Recorded result: conv2 kept 12,902 of its 18,432 weights.

경량화 모바일 변환

# 1. Convert to TorchScript by tracing: a sample input is fed through the model and the
#    operations executed along the way are recorded into a static computation graph.
print("Converting to TorchScript...")
model.eval()  # switch to evaluation mode
sample_input = torch.randn(1, 1, 28, 28).to(device)  # fixed-size input for mobile use (batch of 1)
traced_model = torch.jit.trace(model, sample_input)  # build the TorchScript model by tracing
traced_model.save("autoencoder.pt")  # save the plain TorchScript model; loadable as TorchScript on mobile, but not quantized
print("Saved TorchScript model as 'autoencoder.pt'")

# 2. Dynamic quantization
print("Applying dynamic quantization...")
quantized_model = torch.quantization.quantize_dynamic(
    model.cpu(),  # quantize on CPU; note that nn.Module.cpu() moves the original model in place
    {nn.Linear},  # layer types to quantize (Linear only)
    dtype=torch.qint8  # convert weights to 8-bit integers
)
# quantize_dynamic (inplace=False) returned a separate quantized copy, but the
# .cpu() call above also relocated the original model. Move it back so that
# later steps which pair `model` with inputs on `device` keep working.
model.to(device)

# Convert the quantized model to TorchScript
print("Tracing quantized model on CPU...")
cpu_sample_input = torch.randn(1, 1, 28, 28)  # CPU sample input (no device given = CPU)
quantized_traced_model = torch.jit.trace(quantized_model, cpu_sample_input)  # trace on CPU
quantized_traced_model.save("autoencoder_quantized.pt")  # save the quantized TorchScript model
print("Saved quantized TorchScript model as 'autoencoder_quantized.pt'")

# Compare model sizes (on-disk file sizes)
import os
original_size = os.stat("autoencoder.pt").st_size / 1024  # in KB
quantized_size = os.stat("autoencoder_quantized.pt").st_size / 1024  # in KB
print(f"Original model size: {original_size:.2f} KB")
print(f"Quantized model size: {quantized_size:.2f} KB")

onnx를 통한 프레임워크 Convert

- 프레임워크는 다음과 같이 다양하다
- ex) 파이토치, onnx, 텐서 플로우, ultralytics, 텐서 플로우 라이트

# Convert to ONNX
print("Converting PyTorch model to ONNX...")
model.eval()  # switch to evaluation mode
# nn.Module.cpu()/.to() relocate a module in place, so an earlier step may have
# left the model on a different device than `device`. Re-align it here so the
# model and the sample input are guaranteed to be on the same device.
model.to(device)
sample_input = torch.randn(1, 1, 28, 28).to(device)  # sample input for the export (batch of 1)

# Export to ONNX
torch.onnx.export(
    model,                    # PyTorch model to convert
    sample_input,             # sample input (defines the input shape)
    "autoencoder.onnx",       # output ONNX file name
    export_params=True,       # include the trained weights
    opset_version=11,         # ONNX operator-set version to target
    do_constant_folding=True, # apply constant-folding optimization
    input_names=['input'],    # name of the input tensor
    output_names=['output']   # name of the output tensor
)
print("Saved ONNX model as 'autoencoder.onnx'")

# ONNX로 실행해 보기
import onnxruntime as ort
import numpy as np

# Load the ONNX model
print("Loading ONNX model in ONNX Runtime...")
# Name the execution provider explicitly: ONNX Runtime builds that ship
# non-CPU providers require `providers` to be set (since ORT 1.9), and the
# CPU provider is available in every build.
session = ort.InferenceSession(
    "autoencoder.onnx",
    providers=["CPUExecutionProvider"],
)  # model instance

# Prepare the input
input_name = session.get_inputs()[0].name  # 'input' (the name given at export time)
sample_input = np.random.randn(1, 1, 28, 28).astype(np.float32)  # ONNX Runtime takes NumPy arrays

# Run inference
print("Running inference with ONNX Runtime...")
outputs = session.run(None, {input_name: sample_input})  # None: request all outputs
output_data = outputs[0]  # first output tensor

# Check the result
# NOTE(review): the original notes expect shape (1, 784) for this autoencoder
# (28*28 in, 28*28 out); the model definition is not shown here - confirm.
print("Output shape:", output_data.shape)
print("Sample output (first 10 elements):", output_data.flatten()[:10])

# ONNX를 다른 프레임워크로 변경
from onnx_tf.backend import prepare
import onnx

onnx_model = onnx.load("autoencoder.onnx") # load the ONNX file
tf_model = prepare(onnx_model) # convert it to a TensorFlow representation with the onnx_tf library
tf_model.export_graph("autoencoder_tf")  # save as a TensorFlow SavedModel

Yolo 경량화와 모바일 변환

from ultralytics import YOLO
import torch

# Load the model and point at the dataset config
model = YOLO("yolov8n.pt")
data_yaml = "/content/roboflow/data.yaml"

# Train the model (adjust the epoch count, image size, etc. as needed)
model.train(data=data_yaml, epochs=1, imgsz=640)

# Lightweighting: apply dynamic quantization (intended to speed up inference and
# shrink the model for mobile environments)
# NOTE(review): PyTorch's default dynamic-quantization module mapping appears to
# cover Linear/RNN-type layers but not Conv2d - verify that this call actually
# changes the YOLO model before relying on it.
model.model = torch.quantization.quantize_dynamic(
    model.model, {torch.nn.Linear, torch.nn.Conv2d}, dtype=torch.qint8
)

# Export to ONNX format
model.export(format="onnx")

Grad CAM

XAI : explainable ai <- 지금 배우는 거는 이거

xAI : 응용 ai

종류

1. 머신러닝 XAI: feature importance

2. LLM XAI:

- LLM huggingface - IntegratedGradients 토큰 중요도

- torchframework - 어텐션 스코어

- openai

- ollama

- langchain

3. LLM RAG XAI: 문장 - 어떤 청킹벡터를 봤는지 출력

4. CV XAI: gradCAM

5. Time series XAI: time transformer

ex) LSTM(cell state), 트랜스포머(어텐션)

6. sound XAI: 스펙트로그램 같은 경우 CAM을 쓰기도 한다.