#!/usr/bin/env python3
#   Copyright © 2023  Lee Yingtong Li (RunasSudo)
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU Affero General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU Affero General Public License for more details.
#
#   You should have received a copy of the GNU Affero General Public License
#   along with this program.  If not, see <https://www.gnu.org/licenses/>.

"""Rebuild the ``pbs_mp_brand_name`` table of ``database.db``.

Brand names are read from ``pbs_tpp`` (joined to ``pbs_mpp`` and ``pbs_mp``),
cleaned up, shortened to their shortest unambiguous leading words, and written
to ``pbs_mp_brand_name`` as (mp_code, brand_name) pairs.
"""

import re
import sqlite3

# Matches a word consisting solely of dose-like tokens: digits, the
# punctuation "/.,+%", and the units mg, mL, U, IU (optionally preceded by a
# space). This is a flat alternation rather than a nested quantifier such as
# "(x*(unit)?)+", which accepts the same strings but backtracks exponentially
# on long near-matches.
LOOKS_LIKE_DOSE = re.compile(r'(?:[0-9/.,+%]| ?(?:mg|mL|U|IU))*')

# Words which mark a "brand name" as actually being a company name
_COMPANY_WORDS = frozenset(('pty', 'ltd', 'australia'))

_SELECT_TPP_SQL = 'SELECT * FROM pbs_tpp LEFT JOIN (SELECT code, mp_code FROM pbs_mpp) AS pbs_mpp ON pbs_tpp.mpp_code = pbs_mpp.code LEFT JOIN (SELECT code, preferred_term as mp_preferred_term FROM pbs_mp) AS pbs_mp ON pbs_mpp.mp_code = pbs_mp.code'


def _clean_brand_name(brand_name, generic_name):
    """Return the cleaned-up brand name, or None if it is uninteresting.

    A brand name is uninteresting when it contains any word of the generic
    name, when it looks like a company name, or when nothing is left after
    stripping the trailing dose-like words.
    """
    brand_lower = brand_name.lower()

    # If any word of the generic name is in the brand name, skip it.
    # This is deliberately a substring test, not a whole-word test.
    if any(w.lower() in brand_lower for w in generic_name.split() if w != '+'):
        return None

    # Ignore anything that looks like a company name
    if not _COMPANY_WORDS.isdisjoint(brand_lower.split()):
        return None

    # Strip all trailing words that look like a dose
    words = brand_name.split()
    while words and LOOKS_LIKE_DOSE.fullmatch(words[-1]):
        words.pop()

    # An all-dose name leaves nothing behind; report it as uninteresting
    # rather than returning an empty brand name
    return ' '.join(words) or None


def _shorten_unambiguous_names(brand_names):
    """Shorten each name, in place, to its shortest unambiguous prefix.

    A prefix of leading words is unambiguous when no brand name of any *other*
    mp_code starts with it. Codes are processed in sorted order, so later
    codes are checked against the already-shortened names of earlier ones.
    """
    for mp_code in sorted(brand_names):
        # Names of the other codes do not change while this code is being
        # processed, so collect them once
        other_names = [
            name
            for other_code, names in brand_names.items()
            if other_code != mp_code
            for name in names
        ]

        # Iterate over a snapshot because the set is modified below
        for brand_name in list(brand_names[mp_code]):
            words = brand_name.split()
            for n_words in range(1, len(words)):
                short_name = ' '.join(words[:n_words])

                if any(other.startswith(short_name) for other in other_names):
                    # Conflict - try a longer prefix
                    continue

                # Can shorten
                brand_names[mp_code].discard(brand_name)
                brand_names[mp_code].add(short_name)
                break


def build_brand_names(rows):
    """Compute the brand names to record for each mp_code.

    rows: iterable of mappings (e.g. sqlite3.Row) with the keys
        'brand_name', 'mp_code' and 'mp_preferred_term'.

    Returns a dict mapping each mp_code to the set of its cleaned, shortened
    brand names. Rows where any of the three values is None are skipped: the
    query uses LEFT JOINs, so unmatched rows carry NULLs which cannot be
    processed.
    """
    brand_names = {}

    for row in rows:
        mp_code = row['mp_code']
        raw_brand_name = row['brand_name']
        generic_name = row['mp_preferred_term']
        if mp_code is None or raw_brand_name is None or generic_name is None:
            continue

        cleaned = _clean_brand_name(raw_brand_name, generic_name)
        if cleaned is None:
            continue

        brand_names.setdefault(mp_code, set()).add(cleaned)

    # Reduce names with unambiguous prefixes
    _shorten_unambiguous_names(brand_names)

    return brand_names


def main():
    """Recreate and populate pbs_mp_brand_name in database.db."""
    # Open database
    con = sqlite3.connect('database.db')
    try:
        con.row_factory = sqlite3.Row
        cur = con.cursor()

        # Compute everything before touching the schema, so that a failure
        # here leaves any existing table intact
        cur.execute(_SELECT_TPP_SQL)
        brand_names = build_brand_names(cur.fetchall())

        # Init schema
        cur.execute('DROP TABLE IF EXISTS pbs_mp_brand_name')
        cur.execute('CREATE TABLE pbs_mp_brand_name (id INTEGER PRIMARY KEY AUTOINCREMENT, mp_code STRING, brand_name STRING)')

        # Add to database, in sorted order so that ids are assigned
        # deterministically
        cur.executemany(
            'INSERT INTO pbs_mp_brand_name (mp_code, brand_name) VALUES (?, ?)',
            [
                (mp_code, brand_name)
                for mp_code in sorted(brand_names)
                for brand_name in sorted(brand_names[mp_code])
            ],
        )

        con.commit()
    finally:
        con.close()


if __name__ == '__main__':
    main()