#!/usr/bin/env python # -*- coding: utf-8 -*- import pandas as pd import os from functools import reduce def is_valid_tac(tac: str): if len(tac) != 8: return False try: int(tac) return True except ValueError: return False def get_tac_filtered_df(df): df = df[df.tac.notnull()] df = df[df.tac.apply(is_valid_tac)] return df def join_columns(df, sep, *columns): col1, col2 = columns df[col1].fillna('', inplace=True) df[col2].fillna('', inplace=True) df[col1] = df[[col1, col2]].apply( lambda x: sep.join(x) if len(''.join(x)) else float('NaN'), axis=1) df.drop(col2, axis=1, inplace=True) return df def join_to_final(df1, df2): final_data = pd.concat(([df1, df2]), sort=False) # .reset_index(drop=True) final_data.drop_duplicates(subset='tac', keep="first", inplace=True) final_data.reset_index(drop=True, inplace=True) final_data.index.name = "index" return final_data def save_file_with_int_tac(df, name): df.to_csv(name + '.csv', sep=';') def get_not_common(df1, df2): common = df1.merge(df2, on=['tac', 'tac']) df = df1[(~df1.tac.isin(common.tac)) & (~df1.tac.isin(common.tac))] return df.dropna(how='all') sources_path = './sources' tac_csv = os.path.join(sources_path, 'tac.csv') tacdb_csv = os.path.join(sources_path, 'tacdb (1).csv') imeidb_csv = os.path.join(sources_path, 'imeidb.csv') tac_db_100000 = os.path.join(sources_path, 'tac-db-100000.csv') stadtaus = os.path.join(sources_path, 'www.STADTAUS.com_TW06_IMEI_csv_v1608.csv') tac_data = pd.read_csv(tac_csv, engine='python', sep=';', skipinitialspace=True, encoding='ISO-8859-1', dtype={"tac": str}) tac_data.drop('id', axis=1, inplace=True) tac_data.drop('version', axis=1, inplace=True) tac_data.drop('last_update', axis=1, inplace=True) tac_data = get_tac_filtered_df(tac_data) source = ['tac.csv', ] * len(tac_data) tac_data['source'] = source tac_final = tac_data save_file_with_int_tac(tac_data, 'tac_with0') tacdb_data = pd.read_csv(tacdb_csv, engine='python', sep=',', skiprows=1, skipinitialspace=True, encoding='ISO-8859-1', dtype={"tac": str}) tacdb_data.drop('contributor', axis=1, inplace=True) tacdb_data.drop('comment', axis=1, inplace=True) tacdb_data.drop('gsmarena.1', axis=1, inplace=True) tacdb_data = get_tac_filtered_df(tacdb_data) tacdb_data.rename(columns={'name': 'manufacturer', 'name.1': 'model', 'aka': 'model2'}, inplace=True) source = ['tacdb.csv', ] * len(tacdb_data) tacdb_data['source'] = source tacdb_final = tac_data final_data = join_to_final(tac_data, tacdb_data) save_file_with_int_tac(final_data, 'tac_tacdb_with0') data_lists = [] tac_list_with_name2 = [] for line in open(imeidb_csv, encoding='ISO-8859-1'): if not line.count(',\n'): line.replace('\n', ',\n') tac, manufacturer, *name = line.split(',') if '\n' in name: name.remove('\n') if '' in name: name.remove('') if not name: continue name, name2 = name[0], name[1:] if name2: name2 = ' '.join(name2) else: name2 = '' name = name.replace('\n', '') name2 = name2.replace('\n', '') if name2 != '': tac_list_with_name2.append(tac) data_lists.append([tac, manufacturer, name, name2, 'imeidb.csv']) tac_list = final_data.tac.to_list() imeidb_data = pd.DataFrame.from_records(data_lists, columns=['tac', 'manufacturer', 'model', 'model2', 'source']) imeidb_data = get_tac_filtered_df(imeidb_data) final_data = join_to_final(final_data, imeidb_data) imeidb_final = imeidb_data stadtaus_data = pd.read_csv(stadtaus, sep=';') stadtaus_data.tac = stadtaus_data.tac.str.split("'").str[-1] stadtaus_data = get_tac_filtered_df(stadtaus_data) stadtaus_data.reset_index(inplace=True) stadtaus_data = stadtaus_data.rename(columns={'index': 'old_index'}) stadtaus_data.index.name = 'index' stadtaus_data['source'] = ['STADTAUS.com.csv'] * len(stadtaus_data) save_file_with_int_tac(stadtaus_data, 'cleaned_stadtaus_with0') adapted_stadtaus_data = stadtaus_data.loc[:, ['tac', 'handset_brand', 'handset_model', 'source']] adapted_stadtaus_data.rename(columns={'handset_brand': 'manufacturer', 'handset_model': 'model'}, inplace=True) adapted_stadtaus_final = adapted_stadtaus_data final_data = join_to_final(final_data, adapted_stadtaus_data) #save_file_with_int_tac(final_data, 'tac_tacdb_imeidb_stadtaus_with0') tac_db_100000_data = pd.read_csv(tac_db_100000, sep=';') tac_db_100000_data.rename(columns={'IMEI_part': 'tac', 'IMEI_descr': 'manufacturer_and_name'}, inplace=True) print(len(tac_db_100000_data)) tac_db_100000_data = get_tac_filtered_df(tac_db_100000_data) print(len(tac_db_100000_data)) tac_db_100000_data.dropna(inplace=True) print(len(tac_db_100000_data)) tac_db_100000_final = tac_db_100000_data df1, df2 = final_data, tac_db_100000_data cond = df2.tac.isin(df1.tac) == True not_common = df2.drop(df2[cond].index) save_file_with_int_tac(not_common, '100k_not_common') # final_tac = final_data.tac.to_list() # print(get_not_common(final_data, tac_db_100000_data)) # # #null def merge_it(df1, df2): df = pd.merge(df1, df2, on=['tac'], how='inner') return df df_1 = merge_it(tac_final.loc[:, ['tac', 'manufacturer']], tacdb_final.loc[:, ['tac','manufacturer']]) df_2 = merge_it(df_1, imeidb_final.loc[:, ['tac','manufacturer']]) df_3 = merge_it(df_2, adapted_stadtaus_final.loc[:, ['tac','manufacturer']]) df_4_final = merge_it(df_3, tac_db_100000_final) df_4_final.columns = ['tac', 'manufacturer_tac', 'manufacturer_tacdb', 'manufacturer_imeidb', 'manufacturer_adapted_stadtaus', 'manufacturer_db_100000'] save_file_with_int_tac(df_4_final, 'all_commons') #print(merge_it(tac_final.loc[:, ['tac','manufacturer']], tacdb_final.loc[:, ['tac','manufacturer']]))