Source code for nycparser.nycparser

import re
from typing import Dict, Union, List, Optional, Any


class Parser:
    """Parse single-line NYC addresses and BBL identifiers into components.

    Instance attributes (all plain, mutable data set up in ``__init__``):

    * ``borough_dict`` -- maps upper-case borough names/aliases to the
      borough code (1-5).
    * ``borough_dict_reverse`` -- maps a borough code back to the name
      reported in results.
    * ``filter_terms`` -- upper-case terms stripped from the street text.
    * ``descriptors`` -- list of street descriptors; no method of this class
      reads it.
    """

    # House number at the start of the input, optionally hyphenated
    # (Queens style, e.g. "123-45"), followed by the rest of the address.
    _HOUSE_NUMBER_RE = re.compile(r"^(\d+(?:-\d+)?)\s+(.+)$")
    # Five-digit ZIP with an optional "+4" suffix; group 1 is the 5-digit part.
    _ZIP_RE = re.compile(r"\b(\d{5})(?:-\d{4})?\b")
    # Apartment/unit designators. A keyword must start at a word boundary and
    # must not run straight into another letter, so street names that merely
    # contain one (UNITED, CAPTAIN, APTHORP) are left alone, while "APT5B"
    # and "#5B" are still recognised.
    _UNIT_KEYWORD = r"(?:\b(?:APARTMENT|APT|UNIT|SUITE)(?![A-Z])|#)"
    _APT_PATTERNS = (
        re.compile(r",\s*" + _UNIT_KEYWORD + r"[^\d]*[\w-]+", re.IGNORECASE),
        re.compile(_UNIT_KEYWORD + r"[^\d]*[\w-]+", re.IGNORECASE),
    )
    _PUNCTUATION_RE = re.compile(r"[,\-\.\(\)]+")
    _WHITESPACE_RE = re.compile(r"\s+")

    def __init__(self):
        """Set up the borough lookup tables and street clean-up word lists."""
        self.borough_dict = {
            "MANHATTAN": 1,
            "MN": 1,
            "NEW YORK": 1,
            "BRONX": 2,
            "THE BRONX": 2,
            "BX": 2,
            "BROOKLYN": 3,
            "BK": 3,
            "BKLYN": 3,
            "KINGS": 3,
            "QUEENS": 4,
            "QN": 4,
            "QU": 4,
            "STATEN ISLAND": 5,
            "SI": 5,
            "STATEN IS": 5,
            "RICHMOND": 5,
        }
        self.borough_dict_reverse = {
            1: "MANHATTAN",
            2: "BRONX",
            3: "BROOKLYN",
            4: "QUEENS",
            5: "STATEN ISLAND",
        }
        # Additional terms to filter out from street names
        self.filter_terms = ["NEW YORK", "NY", "USA", "UNITED STATES", "AMERICA"]
        # Common street descriptors (kept for callers; unused by this class)
        self.descriptors = ["REAR", "EAST", "WEST", "NORTH", "SOUTH", "FRONT"]

    def address(self, address: str) -> Dict[str, Any]:
        """
        Parses a single line input address.

        Args:
            address: A single line input address with PHN and Street,
                ex. "100 Gold St."

        Returns:
            A dictionary with PHN, STREET, BOROUGH_CODE, BOROUGH_NAME, ZIP.
            Fields that could not be extracted are None (STREET is "").

        Raises:
            ValueError: If ``address`` is empty or not a string.

        Examples:
            >>> parser = Parser()
            >>> parser.address("100 Gold St, Manhattan, NY 10038")
            {'PHN': '100', 'STREET': 'GOLD ST', 'BOROUGH_CODE': 1, 'BOROUGH_NAME': 'MANHATTAN', 'ZIP': '10038'}
        """
        if not address or not isinstance(address, str):
            raise ValueError("Address must be a non-empty string")

        # Normalize input
        address = address.strip().upper()

        result: Dict[str, Any] = {
            "PHN": None,
            "STREET": "",
            "BOROUGH_CODE": None,
            "BOROUGH_NAME": None,
            "ZIP": None,
        }

        # Special case: a single all-digit token (hyphens allowed) is a PHN.
        if len(address.split()) == 1 and address.replace("-", "").isdigit():
            result["PHN"] = address
            return result

        # Split off the leading house number, if there is one.
        house_match = self._HOUSE_NUMBER_RE.match(address)
        if house_match:
            result["PHN"], unparsed = house_match.groups()
        else:
            unparsed = address

        # Look for the ZIP only in the text after the house number; otherwise
        # a five-digit house number would be reported as the ZIP as well and
        # would shadow the real ZIP at the end of the line.
        zip_match = self._ZIP_RE.search(unparsed)
        if zip_match:
            result["ZIP"] = zip_match.group(1)
            unparsed = self._ZIP_RE.sub("", unparsed)

        # Collect every borough alias present, with its position in the text.
        borough_hits = []
        for alias, code in self.borough_dict.items():
            hit = re.search(r"\b{}\b".format(re.escape(alias)), unparsed)
            if hit:
                borough_hits.append((hit.start(), alias, code))

        street = unparsed
        if borough_hits:
            # The alias that appears earliest in the text decides the borough.
            _, _, borough_code = min(borough_hits)
            result["BOROUGH_CODE"] = borough_code
            result["BOROUGH_NAME"] = self.borough_dict_reverse[borough_code]

            # Strip ALL borough aliases from the street text. Longest first,
            # so "THE BRONX" goes as a unit instead of leaving a stray "THE"
            # behind once "BRONX" has been removed.
            for alias in sorted(self.borough_dict, key=len, reverse=True):
                street = re.sub(r"\b{}\b".format(re.escape(alias)), "", street)

        # Remove apartment/unit information
        for pattern in self._APT_PATTERNS:
            street = pattern.sub("", street)

        # Remove filter terms like state, country, etc.
        for term in self.filter_terms:
            street = re.sub(r"\b{}\b".format(re.escape(term)), "", street)

        # Clean up remaining punctuation and extra spaces
        street = self._PUNCTUATION_RE.sub(" ", street)
        street = self._WHITESPACE_RE.sub(" ", street).strip()

        result["STREET"] = street
        return result

    def bbl(self, bbl: Union[str, int]) -> Dict[str, Any]:
        """
        Parses a single line input BBL (Borough-Block-Lot).

        Args:
            bbl: A single line input bbl. Can contain special characters,
                just needs 10 digits.

        Returns:
            A dictionary with BOROUGH_CODE, BLOCK, LOT and BOROUGH_NAME

        Raises:
            ValueError: If the BBL doesn't contain exactly 10 digits or has
                an invalid borough code.

        Examples:
            >>> parser = Parser()
            >>> parser.bbl("1-01234-0001")
            {'BOROUGH_CODE': 1, 'BLOCK': 1234, 'LOT': 1, 'BOROUGH_NAME': 'MANHATTAN'}
        """
        # Keep only the digits; separators and other characters are ignored.
        digits = "".join(ch for ch in str(bbl) if ch.isdigit())
        if len(digits) != 10:
            raise ValueError(f"{bbl} is not a 10 digit BBL.")

        borough_code = int(digits[0])
        if not 1 <= borough_code <= 5:
            raise ValueError(f"Invalid borough code: {borough_code}")

        # Layout: 1 borough digit, 5 block digits, 4 lot digits.
        return {
            "BOROUGH_CODE": borough_code,
            "BLOCK": int(digits[1:6]),
            "LOT": int(digits[6:10]),
            "BOROUGH_NAME": self.borough_dict_reverse[borough_code],
        }