Translation Word/Char Count Prediction (Part 3d)
Languages POR, SHO, SWA, TWI, YOR
Purpose
There is a relationship between the number of words (and the number of characters) in the source language and the target language. If this relationship can be established and captured in yet another model, such a model can be helpful in at least two ways:
- For training: Validate the alignment of two sentences (in a training example) by comparing their word counts and/or character counts
- For inference: Validate the word count and/or character count of a translated/proofread sentence
In this notebook we will continue to discover models for each language and to evaluate their use in the above roles.
Dataset and Variables
The dataset used in this notebook contains the following features:
- m_descriptor: Unique identifier of a document
- t_lan_E: Language of the translation (English is also considered a translation)
- t_version: Version of a translation
- s_rsen: Number of a sentence within a document
- c_id: Database primary key of a contribution
- e_content_E: Text content of an English contribution
- chars_E: Number of characters in an English contribution
- words_E: Number of words in an English contribution
- t_lan_V: Language of the translation
- e_top: N/A
- be_top: N/A
- c_created_at: Creation time of a contribution
- c_kind: Kind of a contribution
- c_base: N/A
- a_role: N/A
- u_name: N/A
- e_content_V: Text content of a translated contribution
- chars_V: Number of characters in a translated contribution
- words_V: Number of words in a translated contribution
from pathlib import Path
import pandas as pd
%matplotlib inline
!python --version
# Root directory under which the PredictTranslationWordAndCharCount CSVs live.
# Build it from `base_dir` directly instead of concatenating './': string
# concatenation yields a directory named '<base_dir>.' whenever `base_dir`
# has no trailing separator, while Path() already ignores a trailing '/'.
PATH = Path(base_dir)
import plotly.express as px
# --- POR: load the stage-2 output ('~'-separated) ---
por_input_csv = f'{PATH}/PredictTranslationWordAndCharCount/PredictTranslationWordAndCharCount_2-POR-output.csv'
df = pd.read_csv(por_input_csv, sep='~')

# Preview the rows labelled up to 5, showing only identifier and count columns.
preview_columns = [
    'm_descriptor', 't_lan_E', 't_version', 's_rsen', 'c_id',
    'chars_E', 'words_E', 't_lan_V', 'chars_V', 'words_V',
]
df.loc[:5, df.columns.isin(preview_columns)]

# Character counts: translation (y) against English (x), coloured by language.
fig = px.scatter(
    data_frame=df,
    x='chars_E',
    y='chars_V',
    color='t_lan_V',
    title='Translation Characters vs English Characters',
    opacity=0.5,
    hover_data=['m_descriptor', 't_lan_V', 's_rsen'],
    labels={'m_descriptor': 'Descriptor', 't_lan_V': '', 's_rsen': 'Sentence No'},
)
fig.show()

# Outlier inspection: document 1964-0621, sentence 1125.
# Narrow columns for the tabular view of the row ...
pd.set_option('display.max_colwidth', 20)
is_outlier = (df['m_descriptor'] == '1964-0621') & (df['s_rsen'] == 1125)
outdf = df[is_outlier]
outdf
# ... then widen them again and print both full texts for comparison.
pd.set_option('display.max_colwidth', 1000)
print(outdf['e_content_E'].iloc[0])
print(outdf['e_content_V'].iloc[0])
This data point is an outlier caused by over-translation.
# Remove the outlier inspected above. `outdf` still holds the row selected for
# document 1964-0621, sentence 1125 (row label 2989 in the CSV this notebook
# was authored against). Dropping by the selection's own index keeps the
# removal tied to the (m_descriptor, s_rsen) criterion instead of a row label
# that silently goes stale when the stage-2 CSV is regenerated.
# NOTE(review): assumes each (m_descriptor, s_rsen) pair selects exactly one
# row; if it can match several, all of them are dropped - confirm.
df = df.drop(index=outdf.index)

# outlier: document 1949-0718, sentence 458 (previously row label 1622)
pd.set_option('display.max_colwidth',20)
outdf = df[(df['m_descriptor']=='1949-0718') & (df['s_rsen']==458)]
outdf
pd.set_option('display.max_colwidth',1000)
print(outdf.loc[:,['e_content_E']].values[0][0])
print(outdf.loc[:,['e_content_V']].values[0][0])
df = df.drop(index=outdf.index)

# outlier: document 1965-1031y, sentence 632 (previously row label 4969)
pd.set_option('display.max_colwidth',20)
outdf = df[(df['m_descriptor']=='1965-1031y') & (df['s_rsen']==632)]
outdf
pd.set_option('display.max_colwidth',1000)
print(outdf.loc[:,['e_content_E']].values[0][0])
print(outdf.loc[:,['e_content_V']].values[0][0])
df = df.drop(index=outdf.index)

# outlier: document 1965-1031y, sentence 426 (previously row label 4763)
pd.set_option('display.max_colwidth',20)
outdf = df[(df['m_descriptor']=='1965-1031y') & (df['s_rsen']==426)]
outdf
pd.set_option('display.max_colwidth',1000)
print(outdf.loc[:,['e_content_E']].values[0][0])
print(outdf.loc[:,['e_content_V']].values[0][0])
df = df.drop(index=outdf.index)

# outlier: document 1965-1031y, sentence 534 (previously row label 4871)
pd.set_option('display.max_colwidth',20)
outdf = df[(df['m_descriptor']=='1965-1031y') & (df['s_rsen']==534)]
outdf
pd.set_option('display.max_colwidth',1000)
print(outdf.loc[:,['e_content_E']].values[0][0])
print(outdf.loc[:,['e_content_V']].values[0][0])
df = df.drop(index=outdf.index)

# outlier: document 1965-1031y, sentence 54 (previously row label 4391)
pd.set_option('display.max_colwidth',20)
outdf = df[(df['m_descriptor']=='1965-1031y') & (df['s_rsen']==54)]
outdf
pd.set_option('display.max_colwidth',1000)
print(outdf.loc[:,['e_content_E']].values[0][0])
print(outdf.loc[:,['e_content_V']].values[0][0])
df = df.drop(index=outdf.index)

# outlier: document 1965-0117, sentence 164 (previously row label 3243)
pd.set_option('display.max_colwidth',20)
outdf = df[(df['m_descriptor']=='1965-0117') & (df['s_rsen']==164)]
outdf
pd.set_option('display.max_colwidth',1000)
print(outdf.loc[:,['e_content_E']].values[0][0])
print(outdf.loc[:,['e_content_V']].values[0][0])
df = df.drop(index=outdf.index)

# outlier: document 1965-1031y, sentence 191 (previously row label 4528)
pd.set_option('display.max_colwidth',20)
outdf = df[(df['m_descriptor']=='1965-1031y') & (df['s_rsen']==191)]
outdf
pd.set_option('display.max_colwidth',1000)
print(outdf.loc[:,['e_content_E']].values[0][0])
print(outdf.loc[:,['e_content_V']].values[0][0])
df = df.drop(index=outdf.index)

# Word counts after outlier removal: translation (y) against English (x).
fig = px.scatter(data_frame=df, x='words_E', y='words_V', color='t_lan_V',
                 title='Translation Words vs English Words',
                 opacity=.5,
                 hover_data=['m_descriptor','t_lan_V','s_rsen'],
                 labels={'m_descriptor':'Descriptor','t_lan_V':'','s_rsen':'Sentence No'})
fig.show()

# Persist the cleaned POR frame as the stage-3 output ('~'-separated, no index column).
df.to_csv(f'{PATH}/PredictTranslationWordAndCharCount/PredictTranslationWordAndCharCount_3-POR-output.csv', sep='~', index=False, header=True)
# --- SHO: load the stage-2 output ('~'-separated) ---
sho_input_csv = f'{PATH}/PredictTranslationWordAndCharCount/PredictTranslationWordAndCharCount_2-SHO-output.csv'
df = pd.read_csv(sho_input_csv, sep='~')

# Preview the rows labelled up to 5, showing only identifier and count columns.
preview_columns = [
    'm_descriptor', 't_lan_E', 't_version', 's_rsen', 'c_id',
    'chars_E', 'words_E', 't_lan_V', 'chars_V', 'words_V',
]
df.loc[:5, df.columns.isin(preview_columns)]

# Character counts: translation (y) against English (x), coloured by language.
fig = px.scatter(
    data_frame=df,
    x='chars_E',
    y='chars_V',
    color='t_lan_V',
    title='Translation Characters vs English Characters',
    opacity=0.5,
    hover_data=['m_descriptor', 't_lan_V', 's_rsen'],
    labels={'m_descriptor': 'Descriptor', 't_lan_V': '', 's_rsen': 'Sentence No'},
)
fig.show()

# No rows are dropped in this section; the outlier-inspection template is
# kept disabled below.
# #outlier
# pd.set_option('display.max_colwidth',20)
# outdf = df[(df['m_descriptor']=='1965-0822x') & (df['s_rsen']==1)]
# outdf
# print(outdf.loc[:,['e_content_E']].values[0][0])
# print(outdf.loc[:,['e_content_V']].values[0][0])

# Word counts: translation (y) against English (x), coloured by language.
fig = px.scatter(
    data_frame=df,
    x='words_E',
    y='words_V',
    color='t_lan_V',
    title='Translation Words vs English Words',
    opacity=0.5,
    hover_data=['m_descriptor', 't_lan_V', 's_rsen'],
    labels={'m_descriptor': 'Descriptor', 't_lan_V': '', 's_rsen': 'Sentence No'},
)
fig.show()

# Write the frame out as the stage-3 output ('~'-separated, no index column).
sho_output_csv = f'{PATH}/PredictTranslationWordAndCharCount/PredictTranslationWordAndCharCount_3-SHO-output.csv'
df.to_csv(sho_output_csv, sep='~', index=False, header=True)
# --- SWA: load the stage-2 output ('~'-separated) ---
swa_input_csv = f'{PATH}/PredictTranslationWordAndCharCount/PredictTranslationWordAndCharCount_2-SWA-output.csv'
df = pd.read_csv(swa_input_csv, sep='~')

# Preview the rows labelled up to 5, showing only identifier and count columns.
preview_columns = [
    'm_descriptor', 't_lan_E', 't_version', 's_rsen', 'c_id',
    'chars_E', 'words_E', 't_lan_V', 'chars_V', 'words_V',
]
df.loc[:5, df.columns.isin(preview_columns)]

# Character counts: translation (y) against English (x), coloured by language.
fig = px.scatter(
    data_frame=df,
    x='chars_E',
    y='chars_V',
    color='t_lan_V',
    title='Translation Characters vs English Characters',
    opacity=0.5,
    hover_data=['m_descriptor', 't_lan_V', 's_rsen'],
    labels={'m_descriptor': 'Descriptor', 't_lan_V': '', 's_rsen': 'Sentence No'},
)
fig.show()

# No rows are dropped in this section; the outlier-inspection template is
# kept disabled below.
# #outlier
# pd.set_option('display.max_colwidth',20)
# outdf = df[(df['m_descriptor']=='1965-0822x') & (df['s_rsen']==1)]
# outdf
# print(outdf.loc[:,['e_content_E']].values[0][0])
# print(outdf.loc[:,['e_content_V']].values[0][0])

# Word counts: translation (y) against English (x), coloured by language.
fig = px.scatter(
    data_frame=df,
    x='words_E',
    y='words_V',
    color='t_lan_V',
    title='Translation Words vs English Words',
    opacity=0.5,
    hover_data=['m_descriptor', 't_lan_V', 's_rsen'],
    labels={'m_descriptor': 'Descriptor', 't_lan_V': '', 's_rsen': 'Sentence No'},
)
fig.show()

# Write the frame out as the stage-3 output ('~'-separated, no index column).
swa_output_csv = f'{PATH}/PredictTranslationWordAndCharCount/PredictTranslationWordAndCharCount_3-SWA-output.csv'
df.to_csv(swa_output_csv, sep='~', index=False, header=True)
# --- TWI: load the stage-2 output ('~'-separated) ---
df = pd.read_csv(f'{PATH}/PredictTranslationWordAndCharCount/PredictTranslationWordAndCharCount_2-TWI-output.csv', sep='~')
# Preview the rows labelled up to 5, showing only identifier and count columns.
df.loc[:5, df.columns.isin(['m_descriptor','t_lan_E','t_version','s_rsen','c_id','chars_E','words_E','t_lan_V','chars_V','words_V'])]

# Character counts: translation (y) against English (x), coloured by language.
fig = px.scatter(data_frame=df, x='chars_E', y='chars_V', color='t_lan_V',
                 title='Translation Characters vs English Characters',
                 opacity=.5,
                 hover_data=['m_descriptor','t_lan_V','s_rsen'],
                 labels={'m_descriptor':'Descriptor','t_lan_V':'','s_rsen':'Sentence No'})
fig.show()

# outlier: document 1965-1204, sentence 1297 (previously row label 5170)
pd.set_option('display.max_colwidth',20)
outdf = df[(df['m_descriptor']=='1965-1204') & (df['s_rsen']==1297)]
outdf
pd.set_option('display.max_colwidth',1000)
print(outdf.loc[:,['e_content_E']].values[0][0])
print(outdf.loc[:,['e_content_V']].values[0][0])
# Drop by the selection's own index rather than a hard-coded row label, so the
# removal stays correct if the stage-2 CSV is regenerated with different rows.
# NOTE(review): assumes the (m_descriptor, s_rsen) pair selects exactly one
# row; if it can match several, all of them are dropped - confirm.
df = df.drop(index=outdf.index)

# Word counts after outlier removal: translation (y) against English (x).
fig = px.scatter(data_frame=df, x='words_E', y='words_V', color='t_lan_V',
                 title='Translation Words vs English Words',
                 opacity=.5,
                 hover_data=['m_descriptor','t_lan_V','s_rsen'],
                 labels={'m_descriptor':'Descriptor','t_lan_V':'','s_rsen':'Sentence No'})
fig.show()

# Persist the cleaned TWI frame as the stage-3 output ('~'-separated, no index column).
df.to_csv(f'{PATH}/PredictTranslationWordAndCharCount/PredictTranslationWordAndCharCount_3-TWI-output.csv', sep='~', index=False, header=True)
# --- YOR: load the stage-2 output ('~'-separated) ---
df = pd.read_csv(f'{PATH}/PredictTranslationWordAndCharCount/PredictTranslationWordAndCharCount_2-YOR-output.csv', sep='~')
# Preview the rows labelled up to 5, showing only identifier and count columns.
df.loc[:5, df.columns.isin(['m_descriptor','t_lan_E','t_version','s_rsen','c_id','chars_E','words_E','t_lan_V','chars_V','words_V'])]

# Character counts: translation (y) against English (x), coloured by language.
# The `labels` mapping was missing here only; it is passed so this figure's
# hover and legend text match every other scatter plot in the notebook.
fig = px.scatter(data_frame=df, x='chars_E', y='chars_V', color='t_lan_V',
                 title='Translation Characters vs English Characters',
                 opacity=.5,
                 hover_data=['m_descriptor','t_lan_V','s_rsen'],
                 labels={'m_descriptor':'Descriptor','t_lan_V':'','s_rsen':'Sentence No'})
fig.show()

# No rows are dropped in this section; the outlier-inspection template is
# kept disabled below.
# #outlier
# pd.set_option('display.max_colwidth',20)
# outdf = df[(df['m_descriptor']=='1965-0822x') & (df['s_rsen']==1)]
# outdf
# print(outdf.loc[:,['e_content_E']].values[0][0])
# print(outdf.loc[:,['e_content_V']].values[0][0])

# Word counts: translation (y) against English (x), coloured by language.
fig = px.scatter(data_frame=df, x='words_E', y='words_V', color='t_lan_V',
                 title='Translation Words vs English Words',
                 opacity=.5,
                 hover_data=['m_descriptor','t_lan_V','s_rsen'],
                 labels={'m_descriptor':'Descriptor','t_lan_V':'','s_rsen':'Sentence No'})
fig.show()

# Write the frame out as the stage-3 output ('~'-separated, no index column).
df.to_csv(f'{PATH}/PredictTranslationWordAndCharCount/PredictTranslationWordAndCharCount_3-YOR-output.csv', sep='~', index=False, header=True)