MedicineSearch/find_pbs_brand_names.py
2023-01-24 19:58:17 +11:00

86 lines
2.9 KiB
Python
Executable File

#!/usr/bin/env python3
# Copyright © 2023 Lee Yingtong Li (RunasSudo)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import re
import sqlite3
# Matches a whole word made up solely of dose-like pieces: digits, the
# punctuation "/.,+%", and the units mg, mL, U and IU (each optionally preceded
# by a single space). The empty string also matches.
#
# Every alternative consumes at least one character and no two alternatives can
# match the same text, so a non-matching word is rejected in linear time. The
# nested form "(chars*(unit)?)+" accepts exactly the same strings but backtracks
# exponentially on a long run of digits followed by an unexpected character.
# Only match/no-match is meaningful; the pattern defines no capture groups.
LOOKS_LIKE_DOSE = re.compile(r'(?:[0-9/.,+%]| ?(?:mg|mL|IU|U))*')
# Open the SQLite database file (path is relative to the working directory)
con = sqlite3.connect('database.db')
# Rows are returned as sqlite3.Row so that columns can be looked up by name
con.row_factory = sqlite3.Row
cur = con.cursor()

# Recreate the output table from scratch on every run
cur.execute('DROP TABLE IF EXISTS pbs_mp_brand_name')
# NOTE(review): SQLite gives a column declared as STRING *numeric* affinity (not
# TEXT), so values that look like numbers are stored as numbers. Confirm this is
# intended before relying on the stored type of mp_code / brand_name.
cur.execute(
    'CREATE TABLE pbs_mp_brand_name ('
    'id INTEGER PRIMARY KEY AUTOINCREMENT, '
    'mp_code STRING, '
    'brand_name STRING)'
)
# Collect candidate brand names for each medicinal product (MP).
# Each pbs_tpp row is joined to pbs_mpp (to obtain mp_code) and from there to
# pbs_mp (to obtain the MP's preferred term, exposed as mp_preferred_term).
cur.execute('SELECT * FROM pbs_tpp LEFT JOIN (SELECT code, mp_code FROM pbs_mpp) AS pbs_mpp ON pbs_tpp.mpp_code = pbs_mpp.code LEFT JOIN (SELECT code, preferred_term as mp_preferred_term FROM pbs_mp) AS pbs_mp ON pbs_mpp.mp_code = pbs_mp.code')

# Maps mp_code -> set of brand names found for that MP
brand_names = {}

for tpp in cur.fetchall():
    # A LEFT JOIN yields NULL (None in Python) for a row with no match on the
    # right-hand side. Such a row has no MP to file the brand name under and no
    # generic name to compare against, so skip it rather than fail on None.
    if tpp['brand_name'] is None or tpp['mp_code'] is None or tpp['mp_preferred_term'] is None:
        continue

    brand_name_lower = tpp['brand_name'].lower()
    words = tpp['brand_name'].split()
    words_lower = brand_name_lower.split()

    # If any word of the generic name is in the brand name, skip it because it is uninteresting
    # (this is a substring test against the whole lowercased brand name, so the
    # generic word is also found when embedded in a longer brand word)
    if any(w.lower() in brand_name_lower for w in tpp['mp_preferred_term'].split() if w != '+'):
        continue

    # Ignore anything that looks like a company name
    if not {'pty', 'ltd', 'australia'}.isdisjoint(words_lower):
        continue

    # Strip all trailing words that look like a dose
    while words and LOOKS_LIKE_DOSE.fullmatch(words[-1]):
        words.pop()

    # Nothing but dose-like words: there is no name left to record
    if not words:
        continue

    # OK!
    brand_names.setdefault(tpp['mp_code'], set()).add(' '.join(words))
# Reduce names with unambiguous prefixes: replace each multi-word brand name by
# its shortest leading run of words that no brand name of a *different* MP
# starts with. Names for which every proper prefix clashes are left unchanged.
for mp_code in sorted(brand_names):
    own_names = brand_names[mp_code]

    # Iterate over a snapshot because the set is modified inside the loop
    for full_name in list(own_names):
        parts = full_name.split()

        # Try prefixes of 1 word, 2 words, ... but never the full name itself
        for n_words in range(1, len(parts)):
            candidate = ' '.join(parts[:n_words])

            # Plain string-prefix comparison against the current names of every other MP
            clashes = any(
                other_name.startswith(candidate)
                for other_code, other_names in brand_names.items()
                if other_code != mp_code
                for other_name in other_names
            )

            if not clashes:
                # Shortest unambiguous prefix found: swap it in and stop
                own_names.discard(full_name)
                own_names.add(candidate)
                break
# Write the results to the database, ordered by MP code and then by brand name
# so that the autoincrement ids are assigned in a stable order
rows = [
    (mp_code, brand_name)
    for mp_code in sorted(brand_names)
    for brand_name in sorted(brand_names[mp_code])
]

for row in rows:
    cur.execute('INSERT INTO pbs_mp_brand_name (mp_code, brand_name) VALUES (?, ?)', row)

con.commit()