#!/usr/bin/env python
# -*- coding: utf-8 -*-
import pandas as pd
import os
from functools import reduce
def is_valid_tac(tac: str):
if len(tac) != 8:
return False
try:
int(tac)
return True
except ValueError:
return False
def get_tac_filtered_df(df):
df = df[df.tac.notnull()]
df = df[df.tac.apply(is_valid_tac)]
return df
def join_columns(df, sep, *columns):
col1, col2 = columns
df[col1].fillna('', inplace=True)
df[col2].fillna('', inplace=True)
df[col1] = df[[col1, col2]].apply(
lambda x: sep.join(x) if len(''.join(x)) else float('NaN'), axis=1)
df.drop(col2, axis=1, inplace=True)
return df
def join_to_final(df1, df2):
final_data = pd.concat(([df1, df2]), sort=False) # .reset_index(drop=True)
final_data.drop_duplicates(subset='tac', keep="first", inplace=True)
final_data.reset_index(drop=True, inplace=True)
final_data.index.name = "index"
return final_data
def save_file_with_int_tac(df, name):
df.to_csv(name + '.csv', sep=';')
def get_not_common(df1, df2):
common = df1.merge(df2, on=['tac', 'tac'])
df = df1[(~df1.tac.isin(common.tac)) & (~df1.tac.isin(common.tac))]
return df.dropna(how='all')
sources_path = './sources'
tac_csv = os.path.join(sources_path, 'tac.csv')
tacdb_csv = os.path.join(sources_path, 'tacdb (1).csv')
imeidb_csv = os.path.join(sources_path, 'imeidb.csv')
tac_db_100000 = os.path.join(sources_path, 'tac-db-100000.csv')
stadtaus = os.path.join(sources_path, 'www.STADTAUS.com_TW06_IMEI_csv_v1608.csv')
tac_data = pd.read_csv(tac_csv, engine='python', sep=';', skipinitialspace=True, encoding='ISO-8859-1',
dtype={"tac": str})
tac_data.drop('id', axis=1, inplace=True)
tac_data.drop('version', axis=1, inplace=True)
tac_data.drop('last_update', axis=1, inplace=True)
tac_data = get_tac_filtered_df(tac_data)
source = ['tac.csv', ] * len(tac_data)
tac_data['source'] = source
tac_final = tac_data
save_file_with_int_tac(tac_data, 'tac_with0')
tacdb_data = pd.read_csv(tacdb_csv, engine='python', sep=',', skiprows=1, skipinitialspace=True, encoding='ISO-8859-1',
dtype={"tac": str})
tacdb_data.drop('contributor', axis=1, inplace=True)
tacdb_data.drop('comment', axis=1, inplace=True)
tacdb_data.drop('gsmarena.1', axis=1, inplace=True)
tacdb_data = get_tac_filtered_df(tacdb_data)
tacdb_data.rename(columns={'name': 'manufacturer', 'name.1': 'model', 'aka': 'model2'},
inplace=True)
source = ['tacdb.csv', ] * len(tacdb_data)
tacdb_data['source'] = source
tacdb_final = tac_data
final_data = join_to_final(tac_data, tacdb_data)
save_file_with_int_tac(final_data, 'tac_tacdb_with0')
data_lists = []
tac_list_with_name2 = []
for line in open(imeidb_csv, encoding='ISO-8859-1'):
if not line.count(',\n'):
line.replace('\n', ',\n')
tac, manufacturer, *name = line.split(',')
if '\n' in name:
name.remove('\n')
if '' in name:
name.remove('')
if not name:
continue
name, name2 = name[0], name[1:]
if name2:
name2 = ' '.join(name2)
else:
name2 = ''
name = name.replace('\n', '')
name2 = name2.replace('\n', '')
if name2 != '':
tac_list_with_name2.append(tac)
data_lists.append([tac, manufacturer, name, name2, 'imeidb.csv'])
tac_list = final_data.tac.to_list()
imeidb_data = pd.DataFrame.from_records(data_lists, columns=['tac', 'manufacturer', 'model', 'model2', 'source'])
imeidb_data = get_tac_filtered_df(imeidb_data)
final_data = join_to_final(final_data, imeidb_data)
imeidb_final = imeidb_data
stadtaus_data = pd.read_csv(stadtaus, sep=';')
stadtaus_data.tac = stadtaus_data.tac.str.split("'").str[-1]
stadtaus_data = get_tac_filtered_df(stadtaus_data)
stadtaus_data.reset_index(inplace=True)
stadtaus_data = stadtaus_data.rename(columns={'index': 'old_index'})
stadtaus_data.index.name = 'index'
stadtaus_data['source'] = ['STADTAUS.com.csv'] * len(stadtaus_data)
save_file_with_int_tac(stadtaus_data, 'cleaned_stadtaus_with0')
adapted_stadtaus_data = stadtaus_data.loc[:, ['tac', 'handset_brand', 'handset_model', 'source']]
adapted_stadtaus_data.rename(columns={'handset_brand': 'manufacturer', 'handset_model': 'model'}, inplace=True)
adapted_stadtaus_final = adapted_stadtaus_data
final_data = join_to_final(final_data, adapted_stadtaus_data)
#save_file_with_int_tac(final_data, 'tac_tacdb_imeidb_stadtaus_with0')
tac_db_100000_data = pd.read_csv(tac_db_100000, sep=';')
tac_db_100000_data.rename(columns={'IMEI_part': 'tac', 'IMEI_descr': 'manufacturer_and_name'}, inplace=True)
print(len(tac_db_100000_data))
tac_db_100000_data = get_tac_filtered_df(tac_db_100000_data)
print(len(tac_db_100000_data))
tac_db_100000_data.dropna(inplace=True)
print(len(tac_db_100000_data))
tac_db_100000_final = tac_db_100000_data
df1, df2 = final_data, tac_db_100000_data
cond = df2.tac.isin(df1.tac) == True
not_common = df2.drop(df2[cond].index)
save_file_with_int_tac(not_common, '100k_not_common')
# final_tac = final_data.tac.to_list()
# print(get_not_common(final_data, tac_db_100000_data))
#
# #null
def merge_it(df1, df2):
df = pd.merge(df1, df2, on=['tac'], how='inner')
return df
df_1 = merge_it(tac_final.loc[:, ['tac', 'manufacturer']], tacdb_final.loc[:, ['tac','manufacturer']])
df_2 = merge_it(df_1, imeidb_final.loc[:, ['tac','manufacturer']])
df_3 = merge_it(df_2, adapted_stadtaus_final.loc[:, ['tac','manufacturer']])
df_4_final = merge_it(df_3, tac_db_100000_final)
df_4_final.columns = ['tac', 'manufacturer_tac', 'manufacturer_tacdb', 'manufacturer_imeidb',
'manufacturer_adapted_stadtaus', 'manufacturer_db_100000']
save_file_with_int_tac(df_4_final, 'all_commons')
#print(merge_it(tac_final.loc[:, ['tac','manufacturer']], tacdb_final.loc[:, ['tac','manufacturer']]))