Char-level CNN

So I was trying to classify paragraphs into respective groups with characters all swapped up and spaces removed. At first I thought of using a combination of MCMC word decryption and space inference as a preprocessing step, so that I could then naturally use an RNN to perform classification — however, to no avail: it kept getting trapped in local minima. I then searched for a better solution and stumbled upon the Character-level Convolutional Neural Network for Text Classification. The network was designed as follows:

Schema

char_level_rnn

Learning Curve

Screen Shot 2018-02-20 at 16.19.22

This dataset consists of 26 characters only, i.e. no information on spacing, periods, the start of a sentence, and so on. That is probably the reason why the validation error didn't decrease as much. Nevertheless, with a 30% validation split, the model managed to achieve an accuracy of 70%.

The network

Network 1

[python]
class CharCNN(chainer.Chain):

def __init__(self, seq_length, out_size, dropout=0.2, usegpu=True):
super(CharCNN, self).__init__()

with self.init_scope():
self.encoder = Encoder(27, 386, dropout)

self.conv1 = L.Convolution2D(
386, 386, ksize=(7, 1), stride=1, pad=(3, 0), initialW=I.Normal(0.025))
self.conv2 = L.Convolution2D(
386, 386, ksize=(7, 1), stride=1, pad=(3, 0), initialW=I.Normal(0.025))
self.conv3 = L.Convolution2D(
386, 386, ksize=(3, 1), stride=1, pad=(1, 0), initialW=I.Normal(0.025))
self.conv4 = L.Convolution2D(
386, 386, ksize=(3, 1), stride=1, pad=(1, 0), initialW=I.Normal(0.025))
self.conv5 = L.Convolution2D(
386, 386, ksize=(3, 1), stride=1, pad=(1, 0), initialW=I.Normal(0.025))
self.conv6 = L.Convolution2D(
386, 386, ksize=(3, 1), stride=1, pad=(1, 0), initialW=I.Normal(0.025))

self.fc1 = L.Linear(None, 386)
self.fc2 = L.Linear(386, 386)
self.fc3 = L.Linear(386, out_size)
self.usegpu = usegpu
self.dropout = dropout

def __call__(self, x):
h0 = self.encoder(x)

h1 = F.relu(self.conv1(h0))
h2 = F.max_pooling_2d(h1, (3, 1), 1, (1, 0))
h3 = F.relu(self.conv2(h2))
h4 = F.max_pooling_2d(h3, (3, 1), 1, (1, 0))
h5 = F.relu(self.conv3(h4))
h6 = F.relu(self.conv4(h5))
h7 = F.relu(self.conv5(h6))

h8 = F.relu(self.conv6(h7))
h9 = F.max_pooling_2d(h8, (3, 1), 1, (1, 0))

h10 = F.relu(self.fc1(h9))
h11 = F.relu(self.fc2(F.dropout(h10, ratio=self.dropout)))
h12 = self.fc3(F.dropout(h11, ratio=self.dropout))

if chainer.config.train:
return h12
return F.softmax(h12)
[/python]

Network 2

Network 2 is lighter in terms of computation while delivering the same performance. I shall try an RNN next.
[python]
class CharCNN(chainer.Chain):

def __init__(self, seq_length, out_size, dropout=0.2, usegpu=True):
super(CharCNN, self).__init__()

with self.init_scope():
self.encoder = Encoder(27, 54, dropout)

self.bn0 = L.BatchNormalization((54, 452))
self.conv1 = L.Convolution2D(
54, 108, ksize=(7, 1), stride=2, pad=(0, 0), initialW=I.Normal(0.025))

# ceil((452 – 7 + 1) / 2) = 223

self.bn1 = L.BatchNormalization((108, 221))
self.conv2 = L.Convolution2D(
108, 216, ksize=(7, 1), stride=2, pad=(0, 0), initialW=I.Normal(0.025))

self.bn2 = L.BatchNormalization((216, 106))
self.conv3 = L.Convolution2D(
216, 512, ksize=(3, 1), stride=2, pad=(0, 0), initialW=I.Normal(0.025))

self.bn3 = L.BatchNormalization((512, 50))
self.conv4 = L.Convolution2D(
512, 1024, ksize=(3, 1), stride=2, pad=(0, 0), initialW=I.Normal(0.025))

self.bn4 = L.BatchNormalization((1024, 22))
self.conv5 = L.Convolution2D(
1024, 2048, ksize=(3, 1), stride=1, pad=(0, 0), initialW=I.Normal(0.025))

self.bn5 = L.BatchNormalization((2048, 18))
self.conv6 = L.Convolution2D(
2048, 4096, ksize=(3, 1), stride=1, pad=(0, 0), initialW=I.Normal(0.025))

self.bn6 = L.BatchNormalization((4096, 14))
self.fc1 = L.Linear(None, out_size)
self.usegpu = usegpu
self.dropout = dropout

def __call__(self, x):
h_0_1 = self.encoder(x)
h_0_2 = self.bn0(h_0_1)

h_1_1 = F.leaky_relu(self.conv1(h_0_2)) # 223
h_1_2 = F.max_pooling_2d(h_1_1, ksize=(3, 1), stride=1, pad=(0, 0)) # 221
h_1_3 = self.bn1(h_1_2)

h_2_1 = F.leaky_relu(self.conv2(h_1_3)) # 108
h_2_2 = F.max_pooling_2d(h_2_1, ksize=(3, 1), stride=1, pad=(0, 0)) # 106
h_2_3 = self.bn2(h_2_2)

h_3_1 = F.leaky_relu(self.conv3(h_2_3)) #52
h_3_2 = F.max_pooling_2d(h_3_1, ksize=(3, 1), stride=1, pad=(0, 0)) # 50
h_3_3 = self.bn3(h_3_2)

h_4_1 = F.leaky_relu(self.conv4(h_3_3)) # 24
h_4_2 = F.max_pooling_2d(h_4_1, ksize=(3, 1), stride=1, pad=(0, 0)) # 22
h_4_3 = self.bn4(h_4_2)

h_5_1 = F.leaky_relu(self.conv5(h_4_3)) # 20
h_5_2 = F.max_pooling_2d(h_5_1, ksize=(3, 1), stride=1, pad=(0, 0)) # 18
h_5_3 = self.bn5(h_5_2)

h_6_1 = F.leaky_relu(self.conv6(h_5_3)) # 16
h_6_2 = F.max_pooling_2d(h_6_1, ksize=(3, 1), stride=1, pad=(0, 0)) # 14
h_6_3 = self.bn6(h_6_2)

h7 = F.average_pooling_2d(h_6_3, ksize=(14, 1), stride=1, pad=(0, 0)) # op kernel

h8 = self.fc1(h7)
# h11 = F.relu(self.fc2(F.dropout(h10, ratio=self.dropout)))
# h12 = self.fc3(F.dropout(h11, ratio=self.dropout))

if chainer.config.train:
return h8
return F.softmax(h8)

[/python]

References:

  1. Char level CNN

Leave a Reply