MedicineSearch/find_brand_names.py

108 lines
3.7 KiB
Python
Executable File

#!/usr/bin/env python3
# Copyright © 2023 Lee Yingtong Li (RunasSudo)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import re
import sqlite3
# Matches text made up solely of dose-like pieces: digits, the punctuation
# / . , + %, and the units mg, mL, U and IU (each optionally preceded by one
# space). The empty string also matches.
# Written as a single flat alternation instead of a nested quantifier of the
# form ([...]*(...)?)+, whose inner group can match empty or be split in many
# ways and can therefore backtrack exponentially on a long near-match (e.g. a
# run of digits followed by a letter). The set of accepted strings is the same.
# NOTE(review): this pattern has no capture groups; only the truthiness of
# fullmatch() is relied on in the visible code.
LOOKS_LIKE_DOSE = re.compile(r'(?:[0-9/.,+%]| ?(?:mg|mL|IU|U))*')
# Connect to the SQLite file. The row factory is set before the cursor is
# created so that fetched rows can be indexed by column name.
con = sqlite3.connect('database.db')
con.row_factory = sqlite3.Row
cur = con.cursor()

# Recreate the output table from scratch on every run
cur.execute('DROP TABLE IF EXISTS mp_brand_name')
cur.execute(
    'CREATE TABLE mp_brand_name ('
    'id INTEGER PRIMARY KEY AUTOINCREMENT, '
    'mp_preferred_term TEXT, '
    'brand_name TEXT)'
)
# Get PBS brand names
# Both joins are LEFT JOINs, so every pbs_tpp row is returned and
# mp_preferred_term is NULL (None) when no matching pbs_mpp / pbs_mp row exists.
cur.execute(
    'SELECT brand_name, mp_preferred_term FROM pbs_tpp '
    'LEFT JOIN (SELECT code, mp_code FROM pbs_mpp) AS pbs_mpp '
    'ON pbs_tpp.mpp_code = pbs_mpp.code '
    'LEFT JOIN (SELECT code, preferred_term as mp_preferred_term FROM pbs_mp) AS pbs_mp '
    'ON pbs_mpp.mp_code = pbs_mp.code'
)

# Maps each generic name (mp_preferred_term) to the set of its cleaned brand names
brand_names = {}

for tpp in cur.fetchall():
    raw_brand_name = tpp['brand_name']
    generic_name = tpp['mp_preferred_term']

    # A row without a brand name or without a resolved generic name cannot be
    # listed; skip it rather than crash on None (a None key would also break
    # the later sorted() over the dictionary keys)
    if raw_brand_name is None or generic_name is None:
        continue

    brand_lower = raw_brand_name.lower()
    words = raw_brand_name.split()
    words_lower = brand_lower.split()

    # If any word of the generic name appears (as a substring) in the brand
    # name, skip it because it is uninteresting
    if any(w.lower() in brand_lower for w in generic_name.split() if w != '+'):
        continue

    # Ignore anything that looks like a company name
    if 'pty' in words_lower or 'ltd' in words_lower or 'australia' in words_lower:
        continue

    # Strip all trailing words that look like a dose
    while words and LOOKS_LIKE_DOSE.fullmatch(words[-1]):
        words.pop()

    # Nothing but dose-like words: there is no name left to record
    if not words:
        continue

    # OK!
    brand_names.setdefault(generic_name, set()).add(' '.join(words))
# Get non-PBS brand names
# This table is manually curated, so its names are stored as given with no cleaning
cur.execute('SELECT * FROM non_pbs_tpp')
for curated in cur.fetchall():
    generic, brand = curated['mp_preferred_term'], curated['brand_name']
    brand_names.setdefault(generic, set()).add(brand)
# Reduce names with unambiguous prefixes
def _prefix_used_by_other_generic(prefix, own_generic):
    # True if any brand name recorded under a different generic starts with prefix
    for other_generic, other_names in brand_names.items():
        if other_generic == own_generic:
            continue
        for other_name in other_names:
            if other_name.startswith(prefix):
                return True
    return False


# Prefixes that must never be used as the shortened form
_NEVER_SHORTEN_TO = (
    'Coloxyl with',  # Looks strange to cut off here
    'Magnesium',  # Disambiguates Magnesium Forte
)

for generic in sorted(brand_names):
    names_for_generic = brand_names[generic]
    # Iterate over a snapshot because the set is modified inside the loop
    for full_name in tuple(names_for_generic):
        parts = full_name.split()
        # Try proper prefixes from shortest to longest (never the full name)
        for word_count in range(1, len(parts)):
            candidate = ' '.join(parts[:word_count])
            # Conflict if ambiguous brand name prefix, or an explicit exception
            if _prefix_used_by_other_generic(candidate, generic) or candidate in _NEVER_SHORTEN_TO:
                continue
            # Can shorten: replace the full name with the first acceptable prefix
            names_for_generic.discard(full_name)
            # Don't list brand name if it is same as generic
            if candidate.lower() != generic.lower():
                names_for_generic.add(candidate)
            break
# Add to database
# Flatten to (generic, brand) pairs ordered by generic name and then brand
# name, so rows are inserted in a deterministic order
insert_sql = 'INSERT INTO mp_brand_name (mp_preferred_term, brand_name) VALUES (?, ?)'
pairs = [
    (generic, brand)
    for generic in sorted(brand_names)
    for brand in sorted(brand_names[generic])
]
for pair in pairs:
    cur.execute(insert_sql, pair)

# Persist all inserts in one go
con.commit()