quantization

김지훈
Commit 103822ab683a7bd65b15a6e5851c0fd11d6d7178 103822ab 1 parent 92dff93a
Showing 4 changed files with 352 additions and 0 deletions
코드/연합학습/quantization/README.md
코드/연합학습/quantization/fed_train.py
코드/연합학습/quantization/model.py
코드/연합학습/quantization/utils.py
--- a/코드/연합학습/quantization/README.md 0 → 100644
View file @103822a
+++ b/코드/연합학습/quantization/README.md 0 → 100644
View file @103822a
+# 추론시간 개선 - 양자화 시도
+
+## PyTorch quantization - 학습해도 CPU 에서만 실행 가능, 모델의 채널을 신중하게 고르지 않으면 속도 개선 미미함. 또한 양자화 과정으로 학습된 모델은 PyTorch model -> ONNX -> TensorRT 변환이 불가능하여 GPU 에서 실행 불가능.
+
+## TensorRT - 양자화 학습을 사용하지 않고 바로 정밀도 감소 및 양자화 시도. float16 은 10% 정도 속도가 개선되었으나, int8 은 실패함 (사용법 미숙, 입력 값이 0.0 ~ 1.0 등).
\ No newline at end of file
--- a/코드/연합학습/quantization/fed_train.py 0 → 100644
View file @103822a
+++ b/코드/연합학습/quantization/fed_train.py 0 → 100644
View file @103822a
+import utils
+import copy
+from collections import OrderedDict
+
+import model
+import dataset
+
+import importlib
+# Re-import the project-local modules so that edits made to them take effect
+# without restarting the interpreter (presumably for notebook/REPL use --
+# TODO confirm).
+importlib.reload(utils)
+importlib.reload(model)
+importlib.reload(dataset)
+
+# The star import supplies the names this file uses without a module prefix
+# (np, nn, optim, argparse, init_models, run_benchmark, ...).
+from utils import *
+import torch.quantization
+
+
+def add_args(parser):
+    # parser.add_argument('--model', type=str, default='moderate-cnn',
+    #                     help='neural network used in training')
+    parser.add_argument('--dataset', type=str, default='cifar10', metavar='N',
+                        help='dataset used for training')
+    parser.add_argument('--fold_num', type=int, default=0, 
+                        help='5-fold, 0 ~ 4')
+    parser.add_argument('--batch_size', type=int, default=256, metavar='N',
+                        help='input batch size for training')
+    parser.add_argument('--lr', type=float, default=0.002, metavar='LR',
+                        help='learning rate')
+    parser.add_argument('--n_nets', type=int, default=100, metavar='NN',
+                        help='number of workers in a distributed cluster')
+    parser.add_argument('--comm_type', type=str, default='fedtwa', 
+                            help='which type of communication strategy is going to be used: layerwise/blockwise')    
+    parser.add_argument('--comm_round', type=int, default=10, 
+                            help='how many round of communications we shoud use')
+    args = parser.parse_args(args=[])
+    return args
+
+
+def start_fedavg(fed_model, args,
+                          train_data_set,
+                          data_idx_map,
+                          net_data_count,
+                          testloader,
+                          device):
+    print("start fed avg")
+    criterion = nn.CrossEntropyLoss()
+    C = 0.1
+    num_edge = int(max(C * args.n_nets, 1))
+    total_data_count = 0
+    for _, data_count in net_data_count.items():
+        total_data_count += data_count
+    print("total data: %d" % total_data_count)
+
+    # quantize
+    # fed_model.eval()
+    # torch.jit.save(torch.jit.script(fed_model), './float.pth')
+    # return
+    fed_model.fuse_model()
+    # modules_to_fuse = [['conv1', 'relu1'], ['conv2', 'relu2'], ['conv3', 'relu3']]
+    # torch.quantization.fuse_modules(fed_model, modules_to_fuse, inplace=True)
+
+    fed_model.qconfig = torch.quantization.get_default_qat_qconfig('fbgemm')
+    torch.quantization.prepare_qat(fed_model, inplace=True)
+    
+    # for making shape of weight_fake_quant.scale
+    train_data_set.set_idx_map([0])
+    fed_model(torch.from_numpy(np.expand_dims(train_data_set[0][0], axis=0)).float())
+
+    edges, _, _ = init_models(args.n_nets, args)
+    # edges = [copy.deepcopy(fed_model) for net_cnt in range(args.n_nets)]
+    for edge_now in edges:
+        edge_now.fuse_model()
+        edge_now.qconfig = torch.quantization.get_default_qat_qconfig('fbgemm')
+        torch.quantization.prepare_qat(edge_now, inplace=True)
+        edge_now(torch.from_numpy(np.expand_dims(train_data_set[0][0], axis=0)).float())
+
+    # print('quantized \n', edges[edge_index].conv1)
+    # end
+
+    for cr in range(1, args.comm_round + 1):
+        print("Communication round : %d" % (cr))
+
+        np.random.seed(cr)  # make sure for each comparison, select the same clients each round
+        selected_edge = np.random.choice(args.n_nets, num_edge, replace=False)
+        print("selected edge", selected_edge)
+
+        for edge_progress, edge_index in enumerate(selected_edge):
+            train_data_set.set_idx_map(data_idx_map[edge_index])
+            train_loader = torch.utils.data.DataLoader(train_data_set, batch_size=args.batch_size,
+                                                    shuffle=True, num_workers=2)
+            print("[%2d/%2d] edge: %d, data len: %d" % (edge_progress, len(selected_edge), edge_index, len(train_data_set)))
+
+            edges[edge_index] = copy.deepcopy(fed_model)
+            edges[edge_index].to(device)
+            edges[edge_index].train()
+            edge_opt = optim.Adam(params=edges[edge_index].parameters(), lr=args.lr)
+            # train
+            for data_idx, (inputs, labels) in enumerate(train_loader):
+                inputs, labels = inputs.float().to(device), labels.long().to(device)
+
+                edge_opt.zero_grad()
+                # edge_opt[edge_index].zero_grad()
+                edge_pred = edges[edge_index](inputs)
+
+                edge_loss = criterion(edge_pred, labels)
+                edge_loss.backward()
+
+                edge_opt.step()
+                # edge_opt[edge_index].step()
+                edge_loss = edge_loss.item()
+                if data_idx % 100 == 0:
+                    print('[%4d] loss: %.3f' % (data_idx, edge_loss))
+                    break
+            edges[edge_index].to('cpu')
+            # print(edge_index)
+            # local_state = edges[edge_index].state_dict()
+            # for key in edges[edge_index].state_dict().keys():
+            #     if 'activation_post_process' in key or 'fake_quant' in key:
+            #         print(key, local_state[key])
+            # print()
+        # return
+        # cal weight using fed avg
+        update_state = OrderedDict()
+        for k, edge in enumerate(edges):
+            local_state = edge.state_dict()
+            for key in fed_model.state_dict().keys():
+                # if 'zero_point' in key:
+                #     print(local_state[key])
+                if 'activation_post_process' in key or 'fake_quant' in key:
+                    if k == 0:
+                        update_state[key] = local_state[key]
+                    else:
+                        update_state[key] += local_state[key]
+                elif 'enable' in key:
+                    update_state[key] = local_state[key]
+                else:
+                    if k == 0:
+                        update_state[key] = local_state[key] * (net_data_count[k] / total_data_count)
+                    else:
+                        update_state[key] += local_state[key] * (net_data_count[k] / total_data_count)
+            # break
+        for key in update_state.keys():
+            if 'enable' in key:
+                continue
+            if 'activation_post_process' in key or 'fake_quant' in key:
+                # print(key, update_state[key], update_state[key].type())
+                # print(key, update_state[key])
+                if torch.is_floating_point(update_state[key]):
+                    update_state[key] = update_state[key] / args.n_nets
+                else:
+                    update_state[key] = torch.floor_divide(update_state[key], args.n_nets)
+                # print(update_state[key])
+
+        fed_model.load_state_dict(update_state)
+        if cr % 1 == 0:
+            fed_model.to(device)
+            fed_model.eval()
+
+            total_loss = 0.0
+            cnt = 0
+            step_acc = 0.0
+            with torch.no_grad():
+                for i, data in enumerate(testloader):
+                    inputs, labels = data
+                    inputs, labels = inputs.float().to(device), labels.long().to(device)
+
+                    outputs = fed_model(inputs)
+                    _, preds = torch.max(outputs, 1)
+
+                    loss = criterion(outputs, labels)
+                    cnt += inputs.shape[0]
+
+                    corr_sum = torch.sum(preds == labels.data)
+                    step_acc += corr_sum.double()
+                    running_loss = loss.item() * inputs.shape[0]
+                    total_loss += running_loss
+                    if i % 200 == 0:
+                      print('test [%4d] loss: %.3f' % (i, loss.item()))
+                      break
+            print((step_acc / cnt).item())
+            print(total_loss / cnt)
+            fed_model.to('cpu')
+            quantized_fed_model = torch.quantization.convert(fed_model.eval(), inplace=False)
+            torch.jit.save(torch.jit.script(quantized_fed_model), './quan.pth')
+
+
+
+def start_train():
+    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+    print(device)
+    args = add_args(argparse.ArgumentParser())
+
+    seed = 0
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+
+    print("Loading data...")
+    # kwargs = {"./dataset/DoS_dataset.csv" : './DoS_dataset.txt',
+    #         "./dataset/Fuzzy_dataset.csv" : './Fuzzy_dataset.txt',
+    #         "./dataset/RPM_dataset.csv" : './RPM_dataset.txt',
+    #         "./dataset/gear_dataset.csv" : './gear_dataset.txt'
+    # }
+    kwargs = {"./dataset/DoS_dataset.csv" : './DoS_dataset.txt'}
+    train_data_set, data_idx_map, net_class_count, net_data_count, test_data_set = dataset.GetCanDatasetUsingTxtKwarg(args.n_nets, args.fold_num, **kwargs)
+    testloader = torch.utils.data.DataLoader(test_data_set, batch_size=args.batch_size,
+                                            shuffle=False, num_workers=2)
+
+    run_benchmark('./quan.pth', testloader)
+    run_benchmark('./float.pth', testloader)
+    # run_benchmark('./quan.pth', testloader)
+
+    fed_model = model.Net()
+    args.comm_type = 'fedavg'
+    if args.comm_type == "fedavg":
+        start_fedavg(fed_model, args,
+                            train_data_set,
+                            data_idx_map,
+                            net_data_count,
+                            testloader,
+                            device)
+
+# Run the experiment only when this file is executed as a script.
+if __name__ == "__main__":
+    start_train()
--- a/코드/연합학습/quantization/model.py 0 → 100644
View file @103822a
+++ b/코드/연합학습/quantization/model.py 0 → 100644
View file @103822a
+import torch.nn as nn
+import torch.nn.functional as F
+import torch
+from torch.quantization import QuantStub, DeQuantStub
+
+class Net(nn.Module):
+    def __init__(self):
+        super(Net, self).__init__()
+
+        self.quant = QuantStub()
+        self.dequant = DeQuantStub()
+
+        self.conv1 = nn.Sequential(
+            nn.Conv2d(1, 8, 3),
+            nn.ReLU(True),
+        )
+        self.conv2 = nn.Sequential(
+          nn.Conv2d(8, 8, 3),
+          nn.ReLU(True),
+        )
+        self.conv3 = nn.Sequential(
+          nn.Conv2d(8, 8, 3),
+          nn.ReLU(True),
+        )
+        self.fc4 = nn.Linear(8 * 23 * 23, 2)
+
+    def forward(self, x):
+        x = self.quant(x)
+        x = self.conv1(x)
+        x = self.conv2(x)
+        x = self.conv3(x)
+        x = torch.flatten(x, 1)
+        x = self.fc4(x)
+        x = self.dequant(x)
+        return x
+  
+    def fuse_model(self):
+        for m in self.modules():
+          if type(m) == nn.Sequential:
+              torch.quantization.fuse_modules(m, ['0', '1'], inplace=True)
--- a/코드/연합학습/quantization/utils.py 0 → 100644
View file @103822a
+++ b/코드/연합학습/quantization/utils.py 0 → 100644
View file @103822a
+import os
+import argparse
+import json
+import numpy as np
+import torch
+import torch.optim as optim
+import torch.nn as nn
+import torchvision
+import torchvision.transforms as transforms
+import torch.utils.data as data
+import math
+import copy
+import time
+
+import model
+import torch.quantization
+from torch.quantization import QuantStub, DeQuantStub
+
+
+def run_benchmark(model_file, img_loader):
+    elapsed = 0
+    # myModel = torch.jit.load(model_file)
+    # torch.backends.quantized.engine='fbgemm'
+    # myModel.qconfig = torch.quantization.get_default_qat_qconfig('fbgemm')
+    # myModel.eval()
+    myModel = model.Net()
+    # myModel = torch.quantization.quantize_dynamic(myModel, {torch.nn.Linear, torch.nn.Sequential}, dtype=torch.qint8)
+    # print(myModel)
+    # set quantization config for server (x86)
+    myModel.qconfig = torch.quantization.get_default_qat_qconfig('fbgemm')
+    num_batches = 10
+    # # insert observers
+    torch.quantization.prepare(myModel, inplace=True)
+    # # Calibrate the model and collect statistics
+    with torch.no_grad():
+      for i, (images, target) in enumerate(img_loader):
+          images = images.float()
+          target = target.long()
+          if i < num_batches:
+              start = time.time()
+              output = myModel(images)
+              end = time.time()
+              # elapsed = elapsed + (end-start)
+          else:
+              break
+
+    # # convert to quantized version
+    torch.quantization.convert(myModel, inplace=True)
+
+    # quant = QuantStub()
+    with torch.no_grad():
+      for i, (images, target) in enumerate(img_loader):
+          images = images.float()
+          target = target.long()
+          if i < num_batches:
+              start = time.time()
+              output = myModel(images)
+              end = time.time()
+              elapsed = elapsed + (end-start)
+          else:
+              break
+    num_images = images.size()[0] * num_batches
+    print(elapsed)
+    print('Elapsed time: %3.0f ms' % (elapsed/num_images*1000))
+    return elapsed
+
+
+def init_models(n_nets, args):
+    models = []
+    layer_shape = []
+    layer_type = []
+
+    for idx in range(n_nets):
+        # if args.model == "lenet":
+        #     cnn = LeNet()
+        # elif args.model == "vgg":
+        #     cnn = vgg11()
+        models.append(model.Net())
+
+    for (k, v) in models[0].state_dict().items():
+        layer_shape.append(v.shape)
+        layer_type.append(k)
+
+    return models, layer_shape, layer_type
+    
\ No newline at end of file