Miguel Alejandro Salgado Zapien 5f5e0fbd17 update.

2024-02-25 21:15:52 -08:00

229 KiB

Raw Permalink Blame History

from functools import cache
import pandas as pd
import numpy as np

# Do not truncate the column list when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

# Upper-case, accent-free names of Mexican states.
# NOTE(review): this set is not referenced in the visible part of the
# notebook; presumably it is meant to validate the state columns of the
# dataset — confirm before relying on it.
estados_mexicanos = {
    "AGUASCALIENTES",
    "BAJA CALIFORNIA",
    "BAJA CALIFORNIA SUR",
    "CAMPECHE",
    "CHIAPAS",
    "CHIHUAHUA",
    "COAHUILA DE ZARAGOZA",
    "COLIMA",
    "DISTRITO FEDERAL",
    "DURANGO",
    "GUANAJUATO",
    "GUERRERO",
    "HIDALGO",
    "JALISCO",
    "MEXICO",
    "MICHOACAN DE OCAMPO",
    "MORELOS",
    "NAYARIT",
    "NUEVO LEON",
    "OAXACA",
    "PUEBLA",
    # NOTE(review): there are two spaces between "QUERETARO" and "DE" —
    # verify that this matches the spelling used in the data.
    "QUERETARO  DE ARTEAGA",
    "QUINTANA ROO",
    "SAN LUIS POTOSI",
    "SINALOA",
    "SONORA",
    "TABASCO",
    "TAMAULIPAS",
    "TLAXCALA",
    "VERACRUZ DE IGNACIO DE LA LLAVE",
    "YUCATAN",
    "ZACATECAS",
}

# Load the birth records. low_memory=False makes pandas infer each column's
# dtype from the whole file instead of chunk by chunk, which is what pandas
# itself suggests to avoid the "Columns (21) have mixed types" DtypeWarning.
df = pd.read_csv("2010-2019.csv", low_memory=False)

/var/folders/05/y38rqjl55hjb_hbnypxzgrsw0000gn/T/ipykernel_93495/3168623387.py:1: DtypeWarning: Columns (21) have mixed types. Specify dtype option on import or set low_memory=False.
  df = pd.read_csv("2010-2019.csv")

def _ano_nacimiento_vivo_func(str_date):
    try:
        return str_date.split("/")[-1]
    except:
        return ""


# Birth year of each record, extracted from the birth-date text.
df["año_de_nacimiento_vivo"] = df["fecha_nacimiento_nac_vivo"].apply(
    _ano_nacimiento_vivo_func
)

# Keep only rows whose mother's age is strictly between 5 and 90.
df = df[(5 < df["edad_madre"]) & (df["edad_madre"] < 90)]

# Rows whose anomaly code contains "Q9" (the records this notebook treats as
# trisomies). The explicit .copy() makes this an independent DataFrame, so
# adding columns to it later does not raise SettingWithCopyWarning.
df_trisomias = df[df["codigo_anomalia"].apply(lambda x: "Q9" in str(x))].copy()

# Per-year summary of the mother's age among the trisomy rows.
# The two lambdas are labelled "<lambda_0>" and "<lambda_1>" by .agg() (in
# list order), so they are renamed to "Q1" and "Q3" right afterwards.
_agregados_edad_trisomias = [
    "count",
    "mean",
    "std",
    "min",
    "max",
    "median",
    lambda edades: edades.quantile(0.25),  # first quartile
    lambda edades: edades.quantile(0.75),  # third quartile
]
consulta_trisomias = (
    df_trisomias.groupby(["año_de_nacimiento_vivo"])
    .agg({"edad_madre": _agregados_edad_trisomias})
    .rename(columns={"<lambda_0>": "Q1", "<lambda_1>": "Q3"})
)
consulta_trisomias

	edad_madre
	count	mean	std	min	max	median	Q1	Q3
año_de_nacimiento_vivo
2010	930	30.546237	8.244939	10	48	31.0	23.0	37.75
2011	1051	31.010466	8.193777	12	49	32.0	24.0	38.00
2012	961	30.462019	8.310565	13	47	31.0	23.0	38.00
2013	1055	31.182938	8.247919	11	51	32.0	24.0	38.00
2014	1031	31.018429	8.356304	13	50	32.0	24.0	38.00
2015	1016	31.500984	8.295052	14	52	32.0	24.0	39.00
2016	1044	31.453065	8.147413	14	47	32.0	24.0	39.00
2017	1043	31.410355	8.174581	13	47	33.0	24.0	38.50
2018	1059	31.064212	8.173198	13	48	32.0	24.0	38.00
2019	941	32.018066	8.195918	13	47	34.0	25.0	39.00

año_de_nacimiento_vivo
2010     930
2011    1051
2012     961
2013    1055
2014    1031
2015    1016
2016    1044
2017    1043
2018    1059
2019     941
Name: (edad_madre, count), dtype: int64

# Mothers' ages: per-year summary over every row of the filtered dataset.
# The two lambdas are labelled "<lambda_0>" and "<lambda_1>" by .agg() (in
# list order), so they are renamed to "Q1" and "Q3" right afterwards.
_agregados_edad_total = [
    "count",
    "mean",
    "std",
    "min",
    "max",
    "median",
    lambda edades: edades.quantile(0.25),  # first quartile
    lambda edades: edades.quantile(0.75),  # third quartile
]
consulta_total = (
    df.groupby(["año_de_nacimiento_vivo"])
    .agg({"edad_madre": _agregados_edad_total})
    .rename(columns={"<lambda_0>": "Q1", "<lambda_1>": "Q3"})
)
consulta_total

	edad_madre
	count	mean	std	min	max	median	Q1	Q3
año_de_nacimiento_vivo
2010	2063533	25.253220	6.319567	9	58	24.0	20.0	30.0
2011	2156751	25.234223	6.331894	9	58	24.0	20.0	30.0
2012	2197327	25.195768	6.321840	9	58	24.0	20.0	30.0
2013	2189257	25.198235	6.322081	9	59	24.0	20.0	30.0
2014	2173773	25.276009	6.322130	9	58	24.0	20.0	30.0
2015	2143345	25.367835	6.296604	9	59	25.0	20.0	30.0
2016	2079251	25.468008	6.292815	9	59	25.0	20.0	30.0
2017	2037647	25.510821	6.305873	9	62	25.0	21.0	30.0
2018	1940338	25.678051	6.328369	9	60	25.0	21.0	30.0
2019	1867693	25.840630	6.342544	9	58	25.0	21.0	30.0

# Put the population summary and the trisomy summary side by side, one row
# per birth year; the suffixes disambiguate the shared "edad_madre" label.
consulta = consulta_total.join(
    consulta_trisomias,
    lsuffix="_general",
    rsuffix="_trisomias",
)

# Fraction (not yet multiplied by 100) of each year's births that are
# trisomy rows.
_casos = consulta[("edad_madre_trisomias", "count")]
_nacimientos = consulta[("edad_madre_general", "count")]
consulta["porcentaje"] = _casos / _nacimientos

consulta

	edad_madre_general								edad_madre_trisomias								porcentaje
	count	mean	std	min	max	median	Q1	Q3	count	mean	std	min	max	median	Q1	Q3
año_de_nacimiento_vivo
2010	2063533	25.253220	6.319567	9	58	24.0	20.0	30.0	930	30.546237	8.244939	10	48	31.0	23.0	37.75	0.000451
2011	2156751	25.234223	6.331894	9	58	24.0	20.0	30.0	1051	31.010466	8.193777	12	49	32.0	24.0	38.00	0.000487
2012	2197327	25.195768	6.321840	9	58	24.0	20.0	30.0	961	30.462019	8.310565	13	47	31.0	23.0	38.00	0.000437
2013	2189257	25.198235	6.322081	9	59	24.0	20.0	30.0	1055	31.182938	8.247919	11	51	32.0	24.0	38.00	0.000482
2014	2173773	25.276009	6.322130	9	58	24.0	20.0	30.0	1031	31.018429	8.356304	13	50	32.0	24.0	38.00	0.000474
2015	2143345	25.367835	6.296604	9	59	25.0	20.0	30.0	1016	31.500984	8.295052	14	52	32.0	24.0	39.00	0.000474
2016	2079251	25.468008	6.292815	9	59	25.0	20.0	30.0	1044	31.453065	8.147413	14	47	32.0	24.0	39.00	0.000502
2017	2037647	25.510821	6.305873	9	62	25.0	21.0	30.0	1043	31.410355	8.174581	13	47	33.0	24.0	38.50	0.000512
2018	1940338	25.678051	6.328369	9	60	25.0	21.0	30.0	1059	31.064212	8.173198	13	48	32.0	24.0	38.00	0.000546
2019	1867693	25.840630	6.342544	9	58	25.0	21.0	30.0	941	32.018066	8.195918	13	47	34.0	25.0	39.00	0.000504

Pendiente

Generar una gráfica de cajas con las edades de las madres de hijos con trisomías.

https://stackoverflow.com/a/66565512

# One dict per year with the box-plot statistics of the trisomy rows, using
# the key names that Axes.bxp expects; whiskers are the yearly min and max.
stats = [
    {
        "label": anio,
        "mean": fila[("edad_madre", "mean")],
        "count": fila[("edad_madre", "count")],
        "std": fila[("edad_madre", "std")],
        "whislo": fila[("edad_madre", "min")],
        "whishi": fila[("edad_madre", "max")],
        "med": fila[("edad_madre", "median")],
        "q1": fila[("edad_madre", "Q1")],
        "q3": fila[("edad_madre", "Q3")],
    }
    for anio, fila in consulta_trisomias.iterrows()
]

consulta_trisomias.describe()

	edad_madre
	count	mean	std	min	max	median	Q1	Q3
count	10.000000	10.000000	10.000000	10.000000	10.000000	10.000000	10.000000	10.000000
mean	1013.100000	31.166677	8.233967	12.600000	48.600000	32.100000	23.900000	38.325000
std	49.771589	0.461478	0.068843	1.264911	1.837873	0.875595	0.567646	0.500694
min	930.000000	30.462019	8.147413	10.000000	47.000000	31.000000	23.000000	37.750000
25%	974.750000	31.012457	8.179380	12.250000	47.000000	32.000000	24.000000	38.000000
50%	1037.000000	31.123575	8.220429	13.000000	48.000000	32.000000	24.000000	38.000000
75%	1049.250000	31.442388	8.283268	13.000000	49.750000	32.000000	24.000000	38.875000
max	1059.000000	32.018066	8.356304	14.000000	52.000000	34.000000	25.000000	39.000000

%matplotlib notebook
from matplotlib.figure import Figure
from matplotlib.ticker import AutoMinorLocator

fig = Figure()
ax = fig.add_subplot()
ax.bxp(
    stats,
    showfliers=False,
    showmeans=True,
)
ax.set_ylabel("Edad de la Madre")
ax.set_xlabel("Año de Registro")
ax.set_title("Distribuciones de Edad de las Madres")
ax.yaxis.set_minor_locator(AutoMinorLocator())

ax.grid(visible=True, which="both", axis="y", linewidth=1, alpha=0.2)
fig

def _anomalias_filtradas(anomalia):
    splitted = [x for x in anomalia.split(",") if len(x) == 4 and x[:2] == "Q9"]
    if len(splitted) < 2:
        return ",".join(splitted)
    if splitted[0] == splitted[1]:
        return splitted[0]
    return ",".join(splitted)


def _clasificador(codigos):

    if "Q910" in codigos or "Q911" in codigos or "Q912" in codigos or "Q913" in codigos:
        return "Edwards"
    if "Q914" in codigos or "Q915" in codigos or "Q916" in codigos or "Q917" in codigos:
        return "Patau"
    if "Q90" in codigos:
        return "Down"
    return "Otra"


# Label every trisomy row: reduce its anomaly codes to the Q9xx ones, then
# map those codes to a syndrome name.
_codigos_q9 = df_trisomias["codigo_anomalia"].apply(_anomalias_filtradas)
df_trisomias["Trisomía"] = _codigos_q9.apply(_clasificador)

# One boolean indicator column per label, so yearly counts can be obtained
# by summing.
for _columna, _etiqueta in (
    ("Síndrome de Down", "Down"),
    ("Síndrome de Edwards", "Edwards"),
    ("Síndrome de Patau", "Patau"),
    ("Otro Síndrome", "Otra"),
):
    df_trisomias[_columna] = df_trisomias["Trisomía"] == _etiqueta

/var/folders/05/y38rqjl55hjb_hbnypxzgrsw0000gn/T/ipykernel_93495/1078194369.py:30: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_trisomias["Trisomía"] = df_trisomias.codigo_anomalia.apply(
/var/folders/05/y38rqjl55hjb_hbnypxzgrsw0000gn/T/ipykernel_93495/1078194369.py:33: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_trisomias["Síndrome de Down"] = (df_trisomias["Trisomía"] == "Down")
/var/folders/05/y38rqjl55hjb_hbnypxzgrsw0000gn/T/ipykernel_93495/1078194369.py:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_trisomias["Síndrome de Edwards"] = (df_trisomias["Trisomía"] == "Edwards")
/var/folders/05/y38rqjl55hjb_hbnypxzgrsw0000gn/T/ipykernel_93495/1078194369.py:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_trisomias["Síndrome de Patau"] = (df_trisomias["Trisomía"] == "Patau")
/var/folders/05/y38rqjl55hjb_hbnypxzgrsw0000gn/T/ipykernel_93495/1078194369.py:36: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_trisomias["Otro Síndrome"] =  (df_trisomias["Trisomía"] == "Otra")

# Display the column labels currently present on the trisomy frame.
df_trisomias.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'edo_captura', 'edo_nac_madre',
       'fecha_nac_madre', 'edad_madre', 'estado_conyugal',
       'entidad_residencia_madre', 'numero_embarazos', 'hijos_nacidos_muertos',
       'hijos_nacidos_vivos', 'hijos_sobrevivientes', 'el_hijo_anterior_nacio',
       'vive_aun_hijo_anterior', 'orden_nacimiento',
       'recibio_atencion_prenatal', 'trimestre_recibio_primera_consulta',
       'total_consultas_recibidas', 'madre_sobrevivio_al_parto',
       'escolaridad_madre', 'ocupacion_habitual_madre', 'trabaja_actualmente',
       'fecha_nacimiento_nac_vivo', 'hora_nacimiento_nac_vivo',
       'sexo_nac_vivo', 'semanas_gestacion_nac_vivo', 'talla_nac_vivo',
       'peso_nac_vivo', 'valoracion_apgar_nac_vivo',
       'valoracion_silverman_nac_vivo', 'producto_de_un_embarazo',
       'codigo_anomalia', 'entidad_certifico', 'año_de_nacimiento_vivo',
       'Trisomia', 'Sindrome de Down', 'Sindrome de Edwards',
       'Sindrome de Patau', 'Otro Sindrome', 'Trisomía', 'Síndrome de Down',
       'Síndrome de Edwards', 'Síndrome de Patau', 'Otro Síndrome'],
      dtype='object')

import matplotlib.ticker as mtick

# Stacked bar chart: for every birth year, the share (0-100) of each
# syndrome among that year's trisomy rows.
fig = Figure()
ax = fig.add_subplot()

_columnas_sindrome = (
    "Síndrome de Down",
    "Síndrome de Edwards",
    "Síndrome de Patau",
    "Otro Síndrome",
)
# Summing the boolean indicator columns gives the yearly case counts.
_to_plot = df_trisomias.groupby("año_de_nacimiento_vivo").agg(
    {columna: ["sum"] for columna in _columnas_sindrome}
)
_index = _to_plot.index.to_list()
# Columns are (name, "sum") pairs; only the name is used as legend label.
_labels = [nombre for nombre, _ in _to_plot.columns]
_data = _to_plot.to_numpy(dtype="float64")
# Yearly totals divided by 100, so that count / total is already a percentage.
_totals = _data.sum(axis=1) / 100

bottom = np.zeros(_data.shape[0])
for label, _conteos in zip(_labels, _data.T):
    data = _conteos / _totals
    ax.bar(_index, data, 0.7, label=label, bottom=bottom)
    bottom += data  # the next series is stacked on top of this one

ax.yaxis.set_major_formatter(mtick.PercentFormatter())
ax.set_title("Distribución de trisomias y otros síndromes")
ax.legend()
fig

# Yearly case counts per syndrome (sum of the boolean indicator columns).
_to_plot = df_trisomias.groupby("año_de_nacimiento_vivo").agg(
    {
        "Síndrome de Down": ["sum"],
        "Síndrome de Edwards": ["sum"],
        "Síndrome de Patau": ["sum"],
        "Otro Síndrome": ["sum"],
    }
)
_to_plot.columns = ["Down", "Edwards", "Patau", "Otros"]

# Turn the counts into each syndrome's share of the yearly total. This uses
# vectorised DataFrame.sum / DataFrame.div instead of a row-wise
# apply(sum, axis=1) with the Python builtin and a per-column loop.
_total = _to_plot.sum(axis=1)
_to_plot = _to_plot.div(_total, axis=0)
_to_plot["Total"] = _total  # keep the absolute yearly count as last column
_to_plot.describe()

	Down	Edwards	Patau	Otros	Total
count	10.000000	10.000000	10.000000	10.000000	10.000000
mean	0.846123	0.020646	0.007997	0.125234	1013.100000
std	0.040421	0.006969	0.003110	0.038154	49.771589
min	0.797699	0.010627	0.004162	0.065156	930.000000
25%	0.813188	0.016582	0.005697	0.105395	974.750000
50%	0.846609	0.019526	0.007144	0.126721	1037.000000
75%	0.861515	0.023294	0.010124	0.154212	1049.250000
max	0.912181	0.036433	0.012752	0.180645	1059.000000

# Line + scatter plot of the yearly percentage of trisomy rows; each point
# is labelled with the absolute number of cases for that year.
fig = Figure()
ax = fig.add_subplot()
_x = [int(x) for x in (consulta.index.to_list())]
_z = consulta[("edad_madre_trisomias", "count")].to_list()
_y = (consulta.porcentaje * 100).to_list()
ax.plot(_x, _y, "--")
ax.scatter(_x, _y, label="Porcentaje y cantidad de nacidos con malformación")

# Label offsets as (dx, dy) in data units. Most labels sit just below-left
# of their point; the listed positions are nudged so they do not collide
# with the line. Looping over the data (instead of ten hard-coded indices)
# keeps this working when the number of years changes.
_desplazamiento_base = (-0.075, -0.001)
_desplazamientos = {
    1: (-0.075, 0.0005),
    3: (-0.075, 0.0005),
    8: (0.2, -0.00025),
}
for _pos, (_anio, _pct, _conteo) in enumerate(zip(_x, _y, _z)):
    _dx, _dy = _desplazamientos.get(_pos, _desplazamiento_base)
    ax.annotate(f"{int(_conteo)}", (_anio + _dx, _pct + _dy))

ax.set_ylim(0.04, 0.055)
ax.xaxis.set_ticks(_x)
ax.yaxis.set_major_formatter(mtick.PercentFormatter())
ax.set_title("Distribución de nacimiento con malformación")
ax.set_xlabel("Año")
ax.set_ylabel("Porcentaje")
ax.yaxis.set_minor_locator(AutoMinorLocator())
ax.grid(alpha=0.1, which="both")
ax.legend()
fig

consulta.porcentaje.describe()

count    10.000000
mean      0.000487
std       0.000031
min       0.000437
25%       0.000474
50%       0.000485
75%       0.000503
max       0.000546
Name: porcentaje, dtype: float64

# One dict per year with the box-plot statistics of the whole population,
# using the key names that Axes.bxp expects; whiskers are the yearly min
# and max.
stats = [
    {
        "label": anio,
        "mean": fila[("edad_madre", "mean")],
        "count": fila[("edad_madre", "count")],
        "std": fila[("edad_madre", "std")],
        "whislo": fila[("edad_madre", "min")],
        "whishi": fila[("edad_madre", "max")],
        "med": fila[("edad_madre", "median")],
        "q1": fila[("edad_madre", "Q1")],
        "q3": fila[("edad_madre", "Q3")],
    }
    for anio, fila in consulta_total.iterrows()
]

%matplotlib notebook
from matplotlib.figure import Figure
from matplotlib.ticker import AutoMinorLocator

fig = Figure()
ax = fig.add_subplot()
ax.bxp(
    stats,
    showfliers=False,
    showmeans=True,
)
ax.set_ylabel("Edad de la Madre")
ax.set_xlabel("Año de Registro")
ax.set_title("Distribuciones Poblacionales de Edad de las Madres")
ax.yaxis.set_minor_locator(AutoMinorLocator())

ax.grid(visible=True, which="both", axis="y", linewidth=1, alpha=0.2)
fig

consulta_total.describe()

	edad_madre
	count	mean	std	min	max	median	Q1	Q3
count	1.000000e+01	10.000000	10.000000	10.0	10.000000	10.000000	10.000000	10.0
mean	2.084892e+06	25.402280	6.318372	9.0	58.900000	24.500000	20.300000	30.0
std	1.109581e+05	0.220145	0.015605	0.0	1.286684	0.527046	0.483046	0.0
min	1.867693e+06	25.195768	6.292815	9.0	58.000000	24.000000	20.000000	30.0
25%	2.044118e+06	25.238972	6.309296	9.0	58.000000	24.000000	20.000000	30.0
50%	2.111298e+06	25.321922	6.321961	9.0	58.500000	24.500000	20.000000	30.0
75%	2.169518e+06	25.500118	6.326809	9.0	59.000000	25.000000	20.750000	30.0
max	2.197327e+06	25.840630	6.342544	9.0	62.000000	25.000000	21.000000	30.0

229 KiB Raw Permalink Blame History

Pendiente

229 KiB

Raw Permalink Blame History