Char-level CNN

So I was trying to classify paragraphs into their respective groups, with characters all swapped up and spaces removed. At first I thought of combining MCMC word decryption and space inference as a preprocessing step, so that I could then use an RNN naturally to perform classification — but to no avail: it kept getting trapped in local minima. I then searched for a better solution and stumbled upon the Character-level Convolutional Neural Network for Text Classification. The network was designed as follows:

Schema

char_level_rnn

Learning Curve

Screen Shot 2018-02-20 at 16.19.22

This dataset consists of only 26 characters, i.e. there is no information on spacing, periods, the start of a sentence, and so on. That is probably why the validation error didn't decrease further. Still, with a 30% validation split, the model managed to achieve an accuracy of 70%.

The network

Network 1

class CharCNN(chainer.Chain):
    """Character-level CNN for text classification (network 1).

    Six stacked 1-D convolutions (expressed as 2-D convolutions with
    width-1 kernels) over a 386-dim character embedding, followed by
    three fully-connected layers.

    Args:
        seq_length: length of the input character sequence; stored
            nowhere and not used in this class — presumably kept for
            interface compatibility with callers (TODO confirm).
        out_size: number of target classes (size of the final layer).
        dropout: dropout ratio applied before fc2 and fc3, and passed
            to the Encoder.
        usegpu: flag stored on the instance; not used inside this class.
    """

    def __init__(self, seq_length, out_size, dropout=0.2, usegpu=True):
        super(CharCNN, self).__init__()

        with self.init_scope():
            # 27 input symbols (26 letters plus, presumably, a padding
            # symbol — TODO confirm against the Encoder), embedded to 386 dims.
            self.encoder = Encoder(27, 386, dropout)

            # All convolutions are "same"-padded (ksize 7 / pad 3,
            # ksize 3 / pad 1) so the sequence length is preserved.
            self.conv1 = L.Convolution2D(
                386, 386, ksize=(7, 1), stride=1, pad=(3, 0), initialW=I.Normal(0.025))
            self.conv2 = L.Convolution2D(
                386, 386, ksize=(7, 1), stride=1, pad=(3, 0), initialW=I.Normal(0.025))
            self.conv3 = L.Convolution2D(
                386, 386, ksize=(3, 1), stride=1, pad=(1, 0), initialW=I.Normal(0.025))
            self.conv4 = L.Convolution2D(
                386, 386, ksize=(3, 1), stride=1, pad=(1, 0), initialW=I.Normal(0.025))
            self.conv5 = L.Convolution2D(
                386, 386, ksize=(3, 1), stride=1, pad=(1, 0), initialW=I.Normal(0.025))
            self.conv6 = L.Convolution2D(
                386, 386, ksize=(3, 1), stride=1, pad=(1, 0), initialW=I.Normal(0.025))

            # fc1 infers its input size (None) on the first forward pass.
            self.fc1 = L.Linear(None, 386)
            self.fc2 = L.Linear(386, 386)
            self.fc3 = L.Linear(386, out_size)
            self.usegpu = usegpu
            self.dropout = dropout

    def __call__(self, x):
        """Forward pass.

        Returns raw logits in training mode (suitable for
        softmax_cross_entropy) and softmax probabilities otherwise.
        """
        h0 = self.encoder(x)

        # Same-padded convs + stride-1 pooling (ksize 3, pad 1) keep the
        # sequence length unchanged throughout.
        h1 = F.relu(self.conv1(h0))
        h2 = F.max_pooling_2d(h1, (3, 1), 1, (1, 0))
        h3 = F.relu(self.conv2(h2))
        h4 = F.max_pooling_2d(h3, (3, 1), 1, (1, 0))
        h5 = F.relu(self.conv3(h4))
        h6 = F.relu(self.conv4(h5))
        h7 = F.relu(self.conv5(h6))

        h8 = F.relu(self.conv6(h7))
        h9 = F.max_pooling_2d(h8, (3, 1), 1, (1, 0))

        # Classifier head with dropout between the fully-connected layers.
        h10 = F.relu(self.fc1(h9))
        h11 = F.relu(self.fc2(F.dropout(h10, ratio=self.dropout)))
        h12 = self.fc3(F.dropout(h11, ratio=self.dropout))

        if chainer.config.train:
            return h12
        return F.softmax(h12)

Network 2

Network 2 is computationally lighter while delivering the same performance. I shall try an RNN next.

class CharCNN(chainer.Chain):
    """Character-level CNN for text classification (network 2).

    A lighter pyramid variant of network 1: the channel count doubles at
    each stage (54 -> 108 -> 216 -> 512 -> 1024 -> 2048 -> 4096) while the
    sequence length shrinks through strided, unpadded convolutions and
    stride-1 max pooling, ending in global average pooling and a single
    linear classifier.

    The inline ``# N`` comments track the sequence-length dimension after
    each layer, starting from an input length of 452. Each
    BatchNormalization size is hand-matched to the (channels, length)
    shape at that point — change a kernel/stride and these must be
    recomputed.

    Args:
        seq_length: length of the input character sequence; not used in
            this class — presumably kept for interface compatibility
            (TODO confirm).
        out_size: number of target classes.
        dropout: stored and passed to the Encoder; the fc dropout of
            network 1 was dropped along with fc2/fc3.
        usegpu: flag stored on the instance; not used inside this class.
    """

    def __init__(self, seq_length, out_size, dropout=0.2, usegpu=True):
        super(CharCNN, self).__init__()

        with self.init_scope():
            # 27 input symbols embedded into 54 dims (vs 386 in network 1).
            self.encoder = Encoder(27, 54, dropout)
        
            self.bn0 = L.BatchNormalization((54, 452))
            self.conv1 = L.Convolution2D(
                54, 108, ksize=(7, 1), stride=2, pad=(0, 0), initialW=I.Normal(0.025))
            
            # conv1 output length: ceil((452 - 7 + 1) / 2) = 223; the
            # following max pool (ksize 3, stride 1) brings it to 221.
            
            self.bn1 = L.BatchNormalization((108, 221))
            self.conv2 = L.Convolution2D(
                108, 216, ksize=(7, 1), stride=2, pad=(0, 0), initialW=I.Normal(0.025))
            
            self.bn2 = L.BatchNormalization((216, 106))
            self.conv3 = L.Convolution2D(
                216, 512, ksize=(3, 1), stride=2, pad=(0, 0), initialW=I.Normal(0.025))
            
            self.bn3 = L.BatchNormalization((512, 50))
            self.conv4 = L.Convolution2D(
                512, 1024, ksize=(3, 1), stride=2, pad=(0, 0), initialW=I.Normal(0.025))
            
            self.bn4 = L.BatchNormalization((1024, 22))
            self.conv5 = L.Convolution2D(
                1024, 2048, ksize=(3, 1), stride=1, pad=(0, 0), initialW=I.Normal(0.025))
            
            self.bn5 = L.BatchNormalization((2048, 18))
            self.conv6 = L.Convolution2D(
                2048, 4096, ksize=(3, 1), stride=1, pad=(0, 0), initialW=I.Normal(0.025))
            
            
            self.bn6 = L.BatchNormalization((4096, 14))
            # Single classifier layer; input size (None) inferred on the
            # first forward pass.
            self.fc1 = L.Linear(None, out_size)
            self.usegpu = usegpu
            self.dropout = dropout


    def __call__(self, x):
        """Forward pass: conv -> pool -> batch-norm at each stage.

        Returns raw logits in training mode and softmax probabilities
        otherwise. The trailing ``# N`` comments give the sequence
        length after each operation.
        """
        h_0_1  = self.encoder(x)
        h_0_2 = self.bn0(h_0_1)

        h_1_1 = F.leaky_relu(self.conv1(h_0_2)) # 223
        h_1_2 = F.max_pooling_2d(h_1_1, ksize=(3, 1), stride=1, pad=(0, 0)) # 221
        h_1_3 = self.bn1(h_1_2)
        
        h_2_1 = F.leaky_relu(self.conv2(h_1_3)) # 108
        h_2_2 = F.max_pooling_2d(h_2_1, ksize=(3, 1), stride=1, pad=(0, 0)) # 106
        h_2_3 = self.bn2(h_2_2)
        
        h_3_1 = F.leaky_relu(self.conv3(h_2_3)) # 52
        h_3_2 = F.max_pooling_2d(h_3_1, ksize=(3, 1), stride=1, pad=(0, 0)) # 50
        h_3_3 = self.bn3(h_3_2)
        
        h_4_1 = F.leaky_relu(self.conv4(h_3_3)) # 24
        h_4_2 = F.max_pooling_2d(h_4_1, ksize=(3, 1), stride=1, pad=(0, 0)) # 22
        h_4_3 = self.bn4(h_4_2)
        
        h_5_1 = F.leaky_relu(self.conv5(h_4_3)) # 20
        h_5_2 = F.max_pooling_2d(h_5_1, ksize=(3, 1), stride=1, pad=(0, 0)) # 18
        h_5_3 = self.bn5(h_5_2)
        
        h_6_1 = F.leaky_relu(self.conv6(h_5_3)) # 16
        h_6_2 = F.max_pooling_2d(h_6_1, ksize=(3, 1), stride=1, pad=(0, 0)) # 14
        h_6_3 = self.bn6(h_6_2)
        
        # Global average pool over the remaining 14 positions collapses
        # the sequence dimension before the classifier.
        h7 = F.average_pooling_2d(h_6_3, ksize=(14, 1), stride=1, pad=(0, 0)) # op kernel

        h8 = self.fc1(h7)
        
        if chainer.config.train:
            return h8
        return F.softmax(h8)

References:

  1. Char level CNN

Leave a Reply