        for table in self.schema['tables']:
            if table['name'] == table_name:
                for index in table['indexes']:
                    if index['type'] == 'PRIMARY':
                        return index['name']
        return None

Figure 205: Source code for data conversion from JSON format to Redis
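The listing closes with get_primary_key_name(), which resolves a table's primary-key column by scanning the intermediate schema document. A minimal, self-contained sketch of the lookup and the schema shape it assumes (the table and index names here are hypothetical):

    # Minimal sketch of the intermediate schema document that
    # get_primary_key_name() walks; "city" and "id" are hypothetical names.
    schema = {
        "tables": [
            {
                "name": "city",
                "indexes": [
                    {"type": "PRIMARY", "name": "id"},
                    {"type": "UNIQUE", "name": "city_name"},
                ],
            }
        ]
    }

    def get_primary_key_name(schema, table_name):
        # Return the column backing the PRIMARY index, or None if absent.
        for table in schema["tables"]:
            if table["name"] == table_name:
                for index in table["indexes"]:
                    if index["type"] == "PRIMARY":
                        return index["name"]
        return None

    assert get_primary_key_name(schema, "city") == "id"
    assert get_primary_key_name(schema, "country") is None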
import json
import os
import re
import subprocess
from collections import OrderedDict

import mysql.connector
from pymongo import GEO2D, TEXT
import redis

from ckanext.mysql2mongodb.data_conv.core.utilities import open_connection_mongodb, load_mongodb_collection, drop_mongodb_database
from ckanext.mysql2mongodb.data_conv.core.helper import store_collection_to_DS
from ckanext.mysql2mongodb.data_conv.core.interfaces.AbstractSchemaConversion import AbstractSchemaConversion


class RedisSchemaImportConversion(AbstractSchemaConversion):

    def __init__(self):
        super(RedisSchemaImportConversion, self).__init__()

    def set_config(self, schema_conv_init_option, schema_conv_output_option):
        self.schema_conv_init_option = schema_conv_init_option
        self.schema_conv_output_option = schema_conv_output_option

    def get(self):
        return None

    def run(self):
        drop_mongodb_database(self.schema_conv_output_option)
        redisSchema = {}
        redisSchema['database-name'] = self.schema_conv_init_option.dbname
        redisSchema["database-version"] = ''
        redisSchema["schema"] = ''
        redisSchema["tables"] = []
        redisSchema["foreign-keys"] = []
        redisSchema["constraints"] = []
        redisSchema["trigger"] = []
        redisSchema["misc"] = {}

        # Every Redis data type maps to the same two-column (key, value) table;
        # only the sorted-set table is declared without a PRIMARY index.
        for table_name in ["redis_string", "redis_list", "redis_set",
                           "redis_hash", "redis_sortedSet"]:
            table_info = {}
            table_info["name"] = table_name
            table_info["misc"] = {}
            table_info["columns"] = [
                {
                    "name": "redis_key",
                    "column-type": "TEXT",
                    "misc": {},
                    "column-original": "redis_key"
                },
                {
                    "name": "redis_value",
                    "column-type": "TEXT",
                    "misc": {},
                    "column-original": "redis_value"
                }
            ]
            if table_name == "redis_sortedSet":
                table_info["indexes"] = []
            else:
                table_info["indexes"] = [{"type": "PRIMARY", "name": "redis_key"}]
            redisSchema["tables"].append(table_info)

        self.schema = redisSchema
        return True

    def save(self):
        schema_collections = [
            (self.schema_conv_output_option.dbname + "_schema", self.schema)]
        store_collection_to_DS(
            schema_collections, self.schema_conv_output_option.dbname)
        return True

Figure 206: Source code of the schema conversion process from Redis format to JSON format
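For orientation, the schema document that save() persists has the following abbreviated shape; the database name "demo" is hypothetical and three of the five tables are elided:

    # Abbreviated shape of the schema document built by run().
    expected_schema = {
        "database-name": "demo",
        "database-version": "",
        "schema": "",
        "tables": [
            {
                "name": "redis_string",
                "misc": {},
                "columns": [
                    {"name": "redis_key", "column-type": "TEXT",
                     "misc": {}, "column-original": "redis_key"},
                    {"name": "redis_value", "column-type": "TEXT",
                     "misc": {}, "column-original": "redis_value"},
                ],
                "indexes": [{"type": "PRIMARY", "name": "redis_key"}],
            },
            # ... redis_list, redis_set, redis_hash (same columns) ...
            {
                "name": "redis_sortedSet",
                "misc": {},
                "columns": [],   # same two columns as above, elided here
                "indexes": [],   # the sorted-set table carries no PRIMARY index
            },
        ],
        "foreign-keys": [],
        "constraints": [],
        "trigger": [],
        "misc": {},
    }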
import sys
import json
import bson
import re
import time
import pprint
import logging
import redis
from bson.decimal128 import Decimal128
from decimal import Decimal
from bson import BSON
from datetime import datetime
from multiprocessing import Pool
from itertools import repeat
from bson.objectid import ObjectId

import mysql.connector
from pymongo import MongoClient

from ckanext.mysql2mongodb.data_conv.core.interfaces.AbstractDataConversion import AbstractDataConversion
from ckanext.mysql2mongodb.data_conv.core.helper import store_collection_to_DS, get_schema_from_DS
from ckanext.mysql2mongodb.data_conv.core.utilities import open_connection_mongodb, load_mongodb_collection, store_json_to_mongodb

# Get Airflow logger
logger = logging.getLogger("airflow.task")


def extract_dict(selected_keys):
    """
    Build a function that extracts the selected keys from a dict.
    The returned function is meant to be used with Python's map().
    """
    def _extract(input_dict):
        output_dict = {}
        for key in selected_keys:
            output_dict[str(key)] = input_dict[str(key)]
        return output_dict
    return _extract


def open_connection_redis(host, username, password, dbname=None):
    try:
        db_connection = redis.Redis(host=host, port=6379, db=0, password=password)
        db_connection.ping()
        print("Connected to Redis", host)
        return db_connection
    except Exception as e:
        print(f"Error while connecting to Redis on {host}! Please check again.")
        print(e)
        raise e


def open_connection_mongodb(schema_conv_init_option):
    """
    Set up a connection to a MongoDB database.
    Return a database handle on success.
    """
    connection_string = f"mongodb://{schema_conv_init_option.host}:{schema_conv_init_option.port}/"
    try:
        # Making connection
        mongo_client = MongoClient(
            connection_string, username=schema_conv_init_option.username,
            password=schema_conv_init_option.password)
        # Select database
        db_connection = mongo_client[schema_conv_init_option.dbname]
        return db_connection
    except Exception as e:
        print(f"Error while connecting to MongoDB database {schema_conv_init_option.dbname}! Re-check connection or name of database.")
        print(e)
        raise e


class RedisDataExportConversion(AbstractDataConversion):
    """
    Data conversion class used for:
    - Converting and migrating data between the intermediate JSON store and Redis
    - Validating the conversion by re-converting
    """

    def __init__(self):
        super(RedisDataExportConversion, self).__init__()

    def set_config(self, schema_conv_init_option, schema_conv_output_option):
        """
        To set config, you need to provide:
        - schema_conv_init_option: instance of ConvInitOption, which specifies the connection to the "input" database
        - schema_conv_output_option: instance of ConvOutputOption, which specifies the connection to the "output" database
        """
        self.schema_conv_init_option = schema_conv_init_option
        self.schema_conv_output_option = schema_conv_output_option

    def set_schema(self, schema):
        self.schema = schema

    def get(self):
        return self.schema

    def run(self):
        temp_schema = get_schema_from_DS(self.schema_conv_output_option.dbname)
        self.set_schema(temp_schema)

        redis_connection = open_connection_redis(
            self.schema_conv_output_option.host,
            self.schema_conv_output_option.username,
            self.schema_conv_output_option.password,
            self.schema_conv_output_option.dbname)

        mongo_connection = open_connection_mongodb(self.schema_conv_init_option)

        table_names = []
        for table in self.schema['tables']:
            table_names.append(table['name'])

        for collection_name in table_names:
            datas = load_mongodb_collection(
                self.schema_conv_init_option,
                collection_name
            )

            for table in self.schema['tables']:
                if table['name'] == collection_name:
                    columns_list = table['columns']

            # Scalar-array columns are handled as separate child tables
            columns_name_list = []
            for column in columns_list:
                if column['column-type'] != '_ARRAY_OF_SCALARS':
                    columns_name_list.append(column['name'])

            for data in datas:
                row = {}
                pk_name = self.get_primary_key_name(collection_name)
                for key in columns_name_list:
                    if key in data.keys():
                        dtype = type(data[key])
                        if dtype is Decimal128:
                            cell_data = str(data[key].to_decimal())
                        elif dtype is list:
                            cell_data = ",".join(map(str, data[key]))
                        elif dtype is ObjectId:
                            cell_data = str(data[key])
                        else:
                            cell_data = str(data[key])
                    else:
                        cell_data = None
                    row.update({key: cell_data})

                # One row becomes one Redis string, keyed by "<table>_<pk>"
                insert_value = {
                    "key": collection_name + "_" + str(data[pk_name]),
                    "value": json.dumps(row)
                }
                redis_connection.set(insert_value["key"], insert_value["value"])

    def save(self):
        pass

    def get_primary_key_name(self, table_name):
        for table in self.schema['tables']:
            if table['name'] == table_name:
                for index in table['indexes']:
                    if index['type'] == 'PRIMARY':
                        return index['name']
        return None

Figure 207: Source code of the data conversion process from Redis format to JSON
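To make the key construction in run() concrete, here is a hedged sketch of what one converted row looks like in Redis; the table name "city" and its columns are hypothetical:

    import json

    # One row of a hypothetical "city" table with primary key "id"
    # becomes a single Redis string keyed by "<table>_<pk value>".
    row = {"id": "1", "name": "Hanoi", "population": "8000000"}
    key = "city" + "_" + row["id"]    # -> "city_1"
    value = json.dumps(row)           # -> '{"id": "1", "name": "Hanoi", ...}'
    # redis_connection.set(key, value) then stores the serialised row.
    print(key, value)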
import json
import os
import re
import subprocess
import logging
import pprint
from collections import OrderedDict

import mysql.connector
from pymongo import GEO2D, TEXT, MongoClient

from pymongo_schema.compare import compare_schemas_bases
from pymongo_schema.export import transform_data_to_file
from pymongo_schema.extract import extract_pymongo_client_schema
from pymongo_schema.filter import filter_mongo_schema_namespaces, init_filtered_schema
from pymongo_schema.tosql import mongo_schema_to_mapping

from ckanext.mysql2mongodb.data_conv.core.interfaces.AbstractSchemaConversion import AbstractSchemaConversion
from ckanext.mysql2mongodb.data_conv.core.utilities import drop_mongodb_database
from ckanext.mysql2mongodb.data_conv.core.database_connection import ConvInitOption, ConvOutputOption
from ckanext.mysql2mongodb.data_conv.core.helper import read_package_config, read_database_config, getDSClient, store_collection_to_DS

_logger = logging.getLogger("airflow.task")


class MongoSchemaImportConversion(AbstractSchemaConversion):

    def __init__(self):
        super(MongoSchemaImportConversion, self).__init__()
        # Define a name for the schema file, which will be placed in the intermediate folder
        self.schema_filename = "schema.json"

    def set_config(self, schema_conv_init_option, schema_conv_output_option):
        """
        To set up connections, you need to provide:
        - schema_conv_init_option: instance of ConvInitOption, which specifies the connection to the "input" database (MongoDB)
        - schema_conv_output_option: instance of ConvOutputOption, which specifies the connection to the "output" datastore
        """
        self.schema_conv_init_option = schema_conv_init_option
        self.schema_conv_output_option = schema_conv_output_option

    def get(self):
        return self.schema

    def run(self):
        drop_mongodb_database(self.schema_conv_output_option)
        self.convertSchema()
        return True

    def extractSchema(self):
        # Extract the schema from MongoDB
        connection_string = f"mongodb://{self.schema_conv_init_option.host}:{self.schema_conv_init_option.port}/"
        # Making connection
        mongo_client = MongoClient(
            connection_string, username=self.schema_conv_init_option.username,
            password=self.schema_conv_init_option.password)
        schema = extract_pymongo_client_schema(mongo_client)
        return schema

    def convertSchema(self):
        schema = self.extractSchema()

        tables = schema[self.schema_conv_output_option.dbname]
        _logger.info(schema)

        for table_name in tables.keys():
            self.process_column_type(tables[table_name])

        ori_schema = {
            self.schema_conv_output_option.dbname: tables
        }
        _logger.info(ori_schema)

        # Treat any column whose type could not be inferred as a string column
        for temp in ori_schema[self.schema_conv_output_option.dbname].keys():
            for att_name in ori_schema[self.schema_conv_output_option.dbname][temp]['object'].keys():
                if ori_schema[self.schema_conv_output_option.dbname][temp]['object'][att_name]['type'] == 'unknown':
                    ori_schema[self.schema_conv_output_option.dbname][temp]['object'][att_name]['type'] = 'string'

        mapping = mongo_schema_to_mapping(ori_schema)
        _logger.info(mapping)

        # First (and only) database in the mapping
        dbs_name = [*mapping][0]

        result = {}

        connection = getDSClient()
        serverVersion = connection.server_info()["version"]

        result.update({
            "database-name": "Mongo",
            "database-version": serverVersion,
            "misc": {},
            'constraints': [],
            'trigger': []
        })

        tables = []
        fk = []

        for collection_name in mapping[dbs_name].keys():
            index = []
            table = {
                'name': collection_name,
                "misc": {}
            }
            columns = []
            for column_name in mapping[dbs_name][collection_name].keys():
                if column_name == 'pk':
                    pk_name = mapping[dbs_name][collection_name][column_name]

                    # If the collection exposes no "_id" column, synthesise one
                    if pk_name != "_id" and "_id" not in mapping[dbs_name][collection_name].keys():
                        pk_name = "_id"
                        column = {
                            "name": "_id",
                            "column-type": 'TEXT',
                            "misc": {}
                        }
                        columns.append(column)
                        table['columns'] = columns

                    temp = {
                        'type': 'PRIMARY',
                        'name': pk_name
                    }
                    index.append(temp)
                    table['indexes'] = index

                    if mapping[dbs_name][collection_name][column_name] == "_id_postgres":
                        column = {
                            "name": mapping[dbs_name][collection_name][column_name],
                            "column-type": 'TEXT',
                            "misc": {}
                        }
                        columns.append(column)
                        table['columns'] = columns
                    continue

                attribute = mapping[dbs_name][collection_name][column_name]
                value = attribute['type']
                if attribute['type'] == "TIMESTAMP":
                    value = 'DATETIME'
                column = {
                    "name": attribute['dest'] if attribute.get('dest') else column_name,
                    "column-type": value,
                    "misc": {},
                    "column-original": column_name.replace(".", "_")
                }
                if attribute.get('fk'):
                    if attribute.get('type') == '_ARRAY':
                        # An array field becomes a child table referenced by a foreign key
                        pk = ''
                        for idx in table['indexes']:
                            if idx['type'] == 'PRIMARY':
                                pk = idx['name']
                        item = {
                            'name': str(attribute['dest']) + '_' + str(attribute['fk']) + "_fk",
                            "column_references": {
                                "foreign-key-column": attribute['fk'],
                                "foreign-key-table": attribute['dest'],
                                "primary-key-column": pk,
                                "primary-key-table": collection_name
                            }
                        }
                        fk.append(item)
                        continue

                    column.update({
                        'name': attribute['valueField'],
                        "refer-key-column": attribute['fk'],
                        'refer-key-table': attribute['dest']
                    })
                    item = {
                        'name': str(attribute['valueField']) + '_' + str(attribute['fk']) + "_fk",
                        "column_references": {
                            "foreign-key-column": attribute['fk'],
                            "foreign-key-table": attribute['dest'],
                            "primary-key-column": attribute['valueField'],
                            "primary-key-table": collection_name
                        }
                    }
                    fk.append(item)

                columns.append(column)
                table['columns'] = columns
            tables.append(table)

        result.update({
            'tables': tables,
            'foreign-keys': fk,
        })

        self.schema = (dbs_name + "_schema", result)
        return True

    def save(self):
        store_collection_to_DS(
            [self.schema], self.schema_conv_output_option.dbname)

    def process_column_type(self, table):
        _logger.info('process_column_type')
        _logger.info(table)
        if table.get('object'):
            for obj in table.get('object').keys():
                self.process_column_type(table.get('object')[obj])
        else:
            if table['type'] == 'general_scalar':
                table['type'] = 'string'

Figure 208: Source code of the schema conversion process from MongoDB to JSON
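convertSchema() consumes the relational mapping produced by pymongo_schema's mongo_schema_to_mapping(). The following is a hedged illustration of that mapping's shape, inferred solely from the keys the loop above reads ('pk', 'type', 'dest', 'fk', 'valueField'); the real library output may differ in detail:

    # Hedged illustration only; database and collection names are hypothetical.
    mapping = {
        "demo": {                              # dbs_name
            "orders": {                        # collection_name
                "pk": "_id_postgres",
                "_id": {"type": "TEXT", "dest": "_id_postgres"},
                "total": {"type": "REAL"},
                "items": {                     # array field -> child table + FK
                    "type": "_ARRAY",
                    "dest": "orders_items",
                    "fk": "orders__id",
                },
            }
        }
    }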
import sys
import json
import bson
import re
import time
import pprint
import logging
from bson.decimal128 import Decimal128
from decimal import Decimal
from bson import BSON
from datetime import datetime
from multiprocessing import Pool
from itertools import repeat
from bson.objectid import ObjectId

import mysql.connector
from pymongo import MongoClient

from ckanext.mysql2mongodb.data_conv.core.interfaces.AbstractDataConversion import AbstractDataConversion
from ckanext.mysql2mongodb.data_conv.core.utilities import flatten
from ckanext.mysql2mongodb.data_conv.core.helper import store_collection_to_DS

# Get Airflow logger
logger = logging.getLogger("airflow.task")


class MongoDataImportConversion(AbstractDataConversion):
    """
    Data conversion class used for:
    - Converting and migrating data from MongoDB to the intermediate JSON store
    - Validating the conversion by re-converting
    """

    def __init__(self):
        super(MongoDataImportConversion, self).__init__()

    def set_config(self, schema_conv_init_option, schema_conv_output_option, schema):
        """
        To set config, you need to provide:
        - schema_conv_init_option: instance of ConvInitOption, which specifies the connection to the "input" database
        - schema_conv_output_option: instance of ConvOutputOption, which specifies the connection to the "output" database
        - schema: schema object produced by the schema converter
        """
        self.schema = schema.get()[1]
        self.schema.pop('_id', None)
        # set config
        self.schema_conv_init_option = schema_conv_init_option
        self.schema_conv_output_option = schema_conv_output_option
    def run(self):
        self.migrate_to_datastore()
        return True

    def migrate_to_datastore(self):
        mongo_connection_string = f"mongodb://{self.schema_conv_init_option.host}:{self.schema_conv_init_option.port}/"
        # Making connection and selecting the source database
        mongo_database = MongoClient(
            mongo_connection_string, username=self.schema_conv_init_option.username,
            password=self.schema_conv_init_option.password)[self.schema_conv_init_option.dbname]

        collection_names = mongo_database.list_collection_names()

        data = {}
        for name in collection_names:
            for doc in mongo_database[name].find({}):
                self.helper(data, name, doc)

        for name in data.keys():
            logger.info(data[name])
            store_collection_to_DS(
                [(name, data[name])], self.schema_conv_output_option.dbname)

    def helper(self, data, name, collection):
        flatten_collection = flatten(collection)

        column_names = flatten_collection.keys()
        row = {}

        for col_name in column_names:
            column_info = self.get_column_info(name, col_name)
            logger.info(col_name)

            # A column unknown to the schema holds embedded documents: recurse
            # into it as a child table linked back by the parent's primary key.
            if not column_info:
                sub_collection_name = name + '_' + col_name
                pk = self.get_primary_key_name(name)
                fk = self.get_fk_info(sub_collection_name, name, pk)

                logger.info(flatten_collection)

                for coll in flatten_collection[col_name]:
                    sub_collection = coll.copy()
                    sub_collection.update(
                        {fk['column_references']['foreign-key-column']: flatten_collection[pk]})
                    self.helper(data, sub_collection_name, sub_collection)
                continue

            if column_info['column-type'] == '_ARRAY_OF_SCALARS':
                # Each scalar array element becomes one row of the child table
                refer_table = column_info['refer-key-table']
                values = flatten_collection[col_name]
                for value in values:
                    temp = {}
                    temp.update({
                        column_info['refer-key-column']: flatten_collection[self.get_primary_key_name(name)],
                        column_info['name']: value
                    })
                    if data.get(refer_table):
                        data[refer_table].append(temp)
                    else:
                        data.update({refer_table: [temp]})
            else:
                temp = flatten_collection[col_name]
                if column_info['column-type'] == 'TIMESTAMP':
                    temp = flatten_collection[col_name]
                elif column_info['column-type'] == 'DATETIME':
                    temp = temp.replace(microsecond=0)
                if type(temp) == ObjectId:
                    temp = str(temp)
                row.update({col_name: temp})

        if data.get(name):
            data[name].append(row)
        else:
            data.update({name: [row]})

    def get_column_info(self, table_name, col_name):
        logger.info(self.schema)
        for table in self.schema['tables']:
            if table['name'] == table_name:
                for column in table['columns']:
                    if column.get('column-original') and column['column-original'] == col_name:
                        return column
        return None

    def get_primary_key_name(self, table_name):
        for table in self.schema['tables']:
            if table['name'] == table_name:
                for index in table['indexes']:
                    if index['type'] == 'PRIMARY':
                        return index['name']
        return None

    def get_fk_info(self, fk_table_name, pk_table_name, pk_name):
        for fk in self.schema['foreign-keys']:
            column_references = fk['column_references']
            if (column_references['foreign-key-table'] == fk_table_name
                    and column_references['primary-key-table'] == pk_table_name
                    and column_references['primary-key-column'] == pk_name):
                return fk
        return None

Figure 209: Source code of the data conversion process from MongoDB to JSON
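helper() leans on the flatten() utility imported from core.utilities, whose source is not reproduced in this appendix. A minimal sketch of the behaviour the surrounding code appears to assume, namely nested sub-documents collapsed into underscore-joined keys while lists are left intact for the recursive child-table pass:

    # Assumed behaviour of flatten(); the real utility may differ.
    def flatten(doc, parent_key=""):
        out = {}
        for key, value in doc.items():
            full_key = f"{parent_key}_{key}" if parent_key else key
            if isinstance(value, dict):
                out.update(flatten(value, full_key))
            else:
                out[full_key] = value
        return out

    doc = {"_id": 1, "name": {"first": "An", "last": "Nguyen"}, "tags": ["a", "b"]}
    print(flatten(doc))
    # {'_id': 1, 'name_first': 'An', 'name_last': 'Nguyen', 'tags': ['a', 'b']}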
import json
import os
import re
import subprocess
from collections import OrderedDict

import mysql.connector
from pymongo import GEO2D, TEXT

from ckanext.mysql2mongodb.data_conv.core.utilities import open_connection_mongodb, load_mongodb_collection
from ckanext.mysql2mongodb.data_conv.core.helper import store_collection_to_DS
from ckanext.mysql2mongodb.data_conv.core.interfaces.AbstractSchemaConversion import AbstractSchemaConversion


class ImageSchemaImportConversion(AbstractSchemaConversion):

    def __init__(self):
        super(ImageSchemaImportConversion, self).__init__()

    def set_config(self, schema_conv_init_option, schema_conv_output_option):
        """
        To set up connections, you need to provide:
        - schema_conv_init_option: instance of ConvInitOption, which specifies the connection to the "input" source
        - schema_conv_output_option: instance of ConvOutputOption, which specifies the connection to the "output" database
        """
        self.schema_conv_init_option = schema_conv_init_option
        self.schema_conv_output_option = schema_conv_output_option

    def get(self):
        return None

    def run(self):
        # Image files carry no relational schema, so there is nothing to extract
        return True

    def save(self):
        return True

Figure 210: Source code of the schema conversion process from image files to JSON
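Figure 210 is the smallest implementation of the AbstractSchemaConversion contract: image files carry no schema, so run() and save() are effectively no-ops. Every schema converter in this appendix follows the same set_config, run, save lifecycle; a hedged driver sketch (the option objects' constructors are not shown in this appendix, so how they are built is assumed):

    # Hedged sketch of how a pipeline task might drive any schema converter above.
    def convert_schema(converter, init_option, output_option):
        converter.set_config(init_option, output_option)
        if converter.run():      # extract and convert the schema
            converter.save()     # persist it to the datastore
        return converter.get()   # converted schema, or None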
init () 30 31 def set_config(self, schema_conv_init_option, schema_conv_output_option, schema): 32 """ 33 To set config, you need to provide: 34 - schema_conv_init_option: instance of class ConvInitOption, which specified connection to "Input" database 35 - schema_conv_output_option: instance of class ConvOutputOption, which specified connection to "Out" database 36 - schema: schema object acquired from Schema Converter 37 """ 38 self.schema = schema.get() 39 40 # set config 41 self.schema_conv_init_option = schema_conv_init_option 42 self.schema_conv_output_option = schema_conv_output_option 43 44 def set_schema(self, schema): 45 self.schema = schema 46 47 def get(self): 48 return self.schema 49 50 def run(self): 51 location = f'''./images/{self.schema_conv_init_option.dbname}''' 52 connection = getDSClient() 53 db = connection[self.schema_conv_init_option.dbname] 54 fs = gridfs.GridFS(db) 55 56 files = self.getListOfFiles(location) 57 for file in files: 58 datafile = open(file, "rb") 59 thedata = datafile.read() 60 filename = os.path.basename(file) 61 logger.info(filename) 62 stored = fs.put(thedata, filename=filename) 63 64 def save(self): 65 pass 66 67 def getListOfFiles(self, dirName): 68 # create a list of file and sub directories 69 # names in the given directory 70 listOfFile = os.listdir(dirName) 71 allFiles = list() 72 # Iterate over all the entries 73 for entry in listOfFile: 74 # Create full path 75 fullPath = os.path.join(dirName, entry) 76 # If entry is a directory then get the list of files in this directory 77 if os.path.isdir(fullPath): 78 allFiles = allFiles + self.getListOfFiles(fullPath) 79 else: 80 allFiles.append(fullPath) 81 82 return allFiles 83 84 336 Hình 211 Mã nguồn quy trình chuyển đổi liệu từ tập tin ảnh sang JSON 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 #!/usr/bin/env python3 # database_config.py: Classes which are used for construct database connection import os import subprocess from database_function import DatabaseFunctions, DatabaseFunctionsOptions from pathlib import Path import os import logging from pymongo import MongoClient logger = logging.getLogger("airflow.task") class CSVFunctions(DatabaseFunctions): """ Class Conversion Initialized Connection Option This class is usually used for CSV connection """ def init (self, options: DatabaseFunctionsOptions): super(CSVFunctions, self). 
#!/usr/bin/env python3
# database_config.py: Classes used to construct database connections

import os
import subprocess
import logging
from pathlib import Path

from pymongo import MongoClient

from database_function import DatabaseFunctions, DatabaseFunctionsOptions

logger = logging.getLogger("airflow.task")


class CSVFunctions(DatabaseFunctions):
    """
    Conversion initialisation/connection class for CSV input.
    """

    def __init__(self, options: DatabaseFunctionsOptions):
        super(CSVFunctions, self).__init__(options)

    def restore(self, filePath):
        # Recreate the staging database from scratch
        cmd = f'''mysql -h {self.options.host} --user={self.options.username} --password={self.options.password} -e "DROP DATABASE IF EXISTS {self.options.dbname}"'''
        subprocess.run([cmd], check=True, shell=True)
        logger.info(cmd)

        cmd = f'''mysql -h {self.options.host} --user={self.options.username} --password={self.options.password} -e "CREATE DATABASE IF NOT EXISTS {self.options.dbname}"'''
        subprocess.run([cmd], check=True, shell=True)
        logger.info(cmd)

        # Let csvsql infer a CREATE TABLE statement from the CSV header and sample rows
        path = Path(filePath)
        destPath = os.path.join(str(path.parent), str(path.stem) + ".sql")
        cmd = f'''csvsql --dialect mysql --snifflimit 100000 {filePath} > {destPath}'''
        subprocess.run([cmd], check=True, shell=True)
        logger.info(cmd)

        cmd = f'''mysql -h {self.options.host} --user={self.options.username} --password={self.options.password} {self.options.dbname} < "{destPath}"'''
        subprocess.run([cmd], check=True, shell=True)
        logger.info(cmd)

        # Load the CSV rows into the freshly created table
        cmd = f'''mysqlimport -h {self.options.host} --user={self.options.username} --password={self.options.password} --ignore-lines=1 --fields-terminated-by=, --local {self.options.dbname} {filePath}'''
        subprocess.run([cmd], check=True, shell=True)
        logger.info(cmd)

    def backup(self, filePath):
        pass

Figure 212: Source code of the CSV data pre-processor
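For a hypothetical input cities.csv loaded into a database demo, the command sequence restore() generates would look like this (host and credentials invented for illustration):

    # Hedged example of the shell commands emitted by restore().
    expected_commands = [
        'mysql -h localhost --user=root --password=secret -e "DROP DATABASE IF EXISTS demo"',
        'mysql -h localhost --user=root --password=secret -e "CREATE DATABASE IF NOT EXISTS demo"',
        'csvsql --dialect mysql --snifflimit 100000 cities.csv > cities.sql',
        'mysql -h localhost --user=root --password=secret demo < "cities.sql"',
        'mysqlimport -h localhost --user=root --password=secret '
        '--ignore-lines=1 --fields-terminated-by=, --local demo cities.csv',
    ]

Note the division of labour: csvsql (from csvkit) only creates the table definition, while mysqlimport loads the rows, matching the CSV file's basename to the table name.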