MedicineSearch/find_brand_names.py

#!/usr/bin/env python3
#   Copyright © 2023  Lee Yingtong Li (RunasSudo)
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU Affero General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU Affero General Public License for more details.
#
#   You should have received a copy of the GNU Affero General Public License
#   along with this program.  If not, see <https://www.gnu.org/licenses/>.

import re
import sqlite3

# Matches a word made up solely of dose-like tokens: digits, the punctuation
# / . , + %, and the units mg, mL, U and IU (e.g. "500mg", "2.5mg/mL", "10%").
# Intended for use with fullmatch() on a single word.
#
# Written as a flat alternation in which every repetition consumes at least one
# character. A nested form such as '(X*(Y)?)+' accepts exactly the same strings,
# but it can split a run of digits between repetitions in exponentially many
# ways, so a long digit run followed by a non-dose character would make
# fullmatch() backtrack catastrophically.
LOOKS_LIKE_DOSE = re.compile(r'(?:[0-9/.,+%]| ?(?:mg|mL|U|IU))*')

# Connect to the SQLite database file; sqlite3.Row lets result rows be
# indexed by column name as well as by position
con = sqlite3.connect('database.db')
con.row_factory = sqlite3.Row
cur = con.cursor()

# Rebuild the mp_brand_name table from scratch on every run
for _ddl in (
	'DROP TABLE IF EXISTS mp_brand_name',
	'CREATE TABLE mp_brand_name (id INTEGER PRIMARY KEY AUTOINCREMENT, mp_preferred_term TEXT, brand_name TEXT)',
):
	cur.execute(_ddl)

# Get PBS brand names
# pbs_tpp is joined to pbs_mpp and then to pbs_mp so that each brand name is
# paired with the preferred term of its generic product
cur.execute('SELECT brand_name, mp_preferred_term FROM pbs_tpp LEFT JOIN (SELECT code, mp_code FROM pbs_mpp) AS pbs_mpp ON pbs_tpp.mpp_code = pbs_mpp.code LEFT JOIN (SELECT code, preferred_term as mp_preferred_term FROM pbs_mp) AS pbs_mp ON pbs_mpp.mp_code = pbs_mp.code')

# Maps generic preferred term -> set of cleaned brand names
brand_names = {}
for tpp in cur.fetchall():
	raw_name = tpp['brand_name']
	generic_name = tpp['mp_preferred_term']
	
	# The LEFT JOINs produce NULL when no matching generic product exists;
	# such a row cannot be keyed by generic name, so skip it rather than crash
	if raw_name is None or generic_name is None:
		continue
	
	raw_name_lower = raw_name.lower()
	
	# If any word of the generic name is in the brand name, skip it because it is uninteresting
	# (substring test, so the generic word is also found inside a longer brand word)
	if any(w.lower() in raw_name_lower for w in generic_name.split() if w != '+'):
		continue
	
	# Ignore anything that looks like a company name
	words_lower = raw_name_lower.split()
	if any(marker in words_lower for marker in ('pty', 'ltd', 'australia')):
		continue
	
	# Strip all trailing words that look like a dose
	words = raw_name.split()
	while words and LOOKS_LIKE_DOSE.fullmatch(words[-1]):
		words.pop()
	
	# Nothing but dose-like words: there is no name left to record
	if not words:
		continue
	
	# OK!
	brand_names.setdefault(generic_name, set()).add(' '.join(words))

# Get non-PBS brand names
# This table is manually curated, so its names are added verbatim without cleaning
cur.execute('SELECT * FROM non_pbs_tpp')
for tpp in cur.fetchall():
	# Create the set for this generic name on first sight, then record the brand
	brand_names.setdefault(tpp['mp_preferred_term'], set()).add(tpp['brand_name'])

# Reduce names with unambiguous prefixes
# A multi-word brand name is replaced by its shortest leading run of words that
# no brand name of any *other* generic starts with. Generics are visited in
# sorted order.
for mp_preferred_term in sorted(brand_names):
	own_names = brand_names[mp_preferred_term]
	
	# Brand names of every other generic. Only own_names is mutated while this
	# generic is being processed, so this list can be built once per generic
	# instead of once per candidate prefix.
	other_names = [b for m, names in brand_names.items() if m != mp_preferred_term for b in names]
	
	# Iterate over a snapshot because own_names is modified inside the loop
	for brand_name in list(own_names):
		# Can we reduce the length of the name?
		words = brand_name.split()
		
		# Try proper prefixes only, shortest first (the full name is never a candidate)
		for i in range(1, len(words)):
			short_name = ' '.join(words[:i])
			
			# Exceptions
			if short_name == 'Coloxyl with':
				continue
			
			# Conflict: some other generic has a brand name starting with this prefix
			if any(b.startswith(short_name) for b in other_names):
				continue
			
			# Can shorten
			own_names.discard(brand_name)
			own_names.add(short_name)
			break

# Add to database
# Rows are inserted ordered by generic name and then by brand name, so the
# insertion order (and hence the autoincrement ids) does not depend on set
# iteration order
cur.executemany(
	'INSERT INTO mp_brand_name (mp_preferred_term, brand_name) VALUES (?, ?)',
	(
		(mp_preferred_term, brand_name)
		for mp_preferred_term in sorted(brand_names)
		for brand_name in sorted(brand_names[mp_preferred_term])
	),
)

con.commit()

# Close the connection explicitly instead of leaving it to interpreter shutdown
con.close()
Add DB build script 2023-01-24 19:58:17 +11:00			`#!/usr/bin/env python3`
Allow searching by trade name 2023-01-24 19:56:54 +11:00			`# Copyright © 2023 Lee Yingtong Li (RunasSudo)`
			`#`
			`# This program is free software: you can redistribute it and/or modify`
			`# it under the terms of the GNU Affero General Public License as published by`
			`# the Free Software Foundation, either version 3 of the License, or`
			`# (at your option) any later version.`
			`#`
			`# This program is distributed in the hope that it will be useful,`
			`# but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`# GNU Affero General Public License for more details.`
			`#`
			`# You should have received a copy of the GNU Affero General Public License`
			`# along with this program. If not, see <https://www.gnu.org/licenses/>.`

			`import re`
			`import sqlite3`

			`LOOKS_LIKE_DOSE = re.compile(r'([0-9/.,+%]*( ?(mg\|mL\|U\|IU))?)+')`

			`# Open database`
			`con = sqlite3.connect('database.db')`
			`con.row_factory = sqlite3.Row`
			`cur = con.cursor()`

			`# Init schema`
Add selected non-PBS medicines 2023-02-04 15:48:44 +11:00			`cur.execute('DROP TABLE IF EXISTS mp_brand_name')`
			`cur.execute('CREATE TABLE mp_brand_name (id INTEGER PRIMARY KEY AUTOINCREMENT, mp_preferred_term TEXT, brand_name TEXT)')`
Allow searching by trade name 2023-01-24 19:56:54 +11:00
Add selected non-PBS medicines 2023-02-04 15:48:44 +11:00			`# Get PBS brand names`
			`cur.execute('SELECT brand_name, mp_preferred_term FROM pbs_tpp LEFT JOIN (SELECT code, mp_code FROM pbs_mpp) AS pbs_mpp ON pbs_tpp.mpp_code = pbs_mpp.code LEFT JOIN (SELECT code, preferred_term as mp_preferred_term FROM pbs_mp) AS pbs_mp ON pbs_mpp.mp_code = pbs_mp.code')`
Allow searching by trade name 2023-01-24 19:56:54 +11:00
			`brand_names = {}`
			`for tpp in cur.fetchall():`
			`words = tpp['brand_name'].split()`
			`words_lower = tpp['brand_name'].lower().split()`

			`# If any word of the generic name is in the brand name, skip it because it is uninteresting`
			`if any(w.lower() in tpp['brand_name'].lower() for w in tpp['mp_preferred_term'].split() if w != '+'):`
			`continue`

			`# Ignore anything that looks like a company name`
			`if 'pty' in words_lower or 'ltd' in words_lower or 'australia' in words_lower:`
			`continue`

			`# Strip all trailing words that look like a dose`
			`for i in reversed(range(len(words))):`
			`if LOOKS_LIKE_DOSE.fullmatch(words[i]):`
			`del words[i]`
			`else:`
			`break`

			`# OK!`
			`brand_name = ' '.join(words)`

Add selected non-PBS medicines 2023-02-04 15:48:44 +11:00			`if tpp['mp_preferred_term'] not in brand_names:`
			`brand_names[tpp['mp_preferred_term']] = set()`
Allow searching by trade name 2023-01-24 19:56:54 +11:00
Add selected non-PBS medicines 2023-02-04 15:48:44 +11:00			`brand_names[tpp['mp_preferred_term']].add(brand_name)`

			`# Get non-PBS brand names`
			`cur.execute('SELECT * FROM non_pbs_tpp')`
			`for tpp in cur.fetchall():`
			`# This is manually curated so no need for cleaning`

			`if tpp['mp_preferred_term'] not in brand_names:`
			`brand_names[tpp['mp_preferred_term']] = set()`

			`brand_names[tpp['mp_preferred_term']].add(tpp['brand_name'])`
Allow searching by trade name 2023-01-24 19:56:54 +11:00
			`# Reduce names with unambiguous prefixes`
Add selected non-PBS medicines 2023-02-04 15:48:44 +11:00			`for mp_preferred_term in sorted(brand_names.keys()):`
			`for brand_name in list(brand_names[mp_preferred_term]):`
Allow searching by trade name 2023-01-24 19:56:54 +11:00			`# Can we reduce the length of the name?`
			`words = brand_name.split()`

			`for i in range(1, len(words)):`
			`short_name = ' '.join(words[0:i])`

Add selected non-PBS medicines 2023-02-04 15:48:44 +11:00			`if any(b.startswith(short_name) for m in brand_names.keys() if m != mp_preferred_term for b in brand_names[m]):`
Allow searching by trade name 2023-01-24 19:56:54 +11:00			`# Conflict`
			`continue`

Include RPBS items 2023-02-04 14:09:58 +11:00			`# Exceptions`
			`if short_name == 'Coloxyl with':`
			`continue`

Allow searching by trade name 2023-01-24 19:56:54 +11:00			`# Can shorten`
Add selected non-PBS medicines 2023-02-04 15:48:44 +11:00			`if brand_name in brand_names[mp_preferred_term]:`
			`brand_names[mp_preferred_term].remove(brand_name)`
			`brand_names[mp_preferred_term].add(short_name)`
Allow searching by trade name 2023-01-24 19:56:54 +11:00			`break`

			`# Add to database`
Add selected non-PBS medicines 2023-02-04 15:48:44 +11:00			`for mp_preferred_term in sorted(brand_names.keys()):`
			`for brand_name in sorted(list(brand_names[mp_preferred_term])):`
			`cur.execute('INSERT INTO mp_brand_name (mp_preferred_term, brand_name) VALUES (?, ?)', (mp_preferred_term, brand_name))`
Allow searching by trade name 2023-01-24 19:56:54 +11:00
			`con.commit()`