Source code for element_extract

#!/usr/bin/env python
#"""Writen by Niraj K. Nepal, Ph.D."""
"""Module to extract data from materials project database"""
import os
import glob
import warnings
import pandas as pd
from mp_api.client import MPRester
from htepc import MpConnect
from check_json import config
warnings.filterwarnings('ignore')
# Make sure that you have the Materials API key.
# MPRester if needed, e.g, MPRester("API_KEY")
#get API_KEY from material projects, go to dashboard and generate the key.


[docs]
def create_folder(parent_folder):
    """
    Function to create a 'download' folder
    parameters
    --------------
    parent_folder : path to current working directory
    """
    if not os.path.isdir(parent_folder+"/download"):
        os.mkdir(parent_folder+"/download")



[docs]
def download(elm,num_el,exclude_el,properties):
    """
    Extracts various properties for compounds that satisfy certain criteria from the Materials Project database.

    Parameters:
    -----------
    elm : str or list of str
        Element(s) always to include in the compounds. For example, for hydrogen, elm = 'H'.
        If multiple elements are desired, provide a list with up to size 2. For example, elm = ['B', 'C'] for boron and carbon.

    num_el : int
        Number of elements in the compound.

    exclude_el : list of str
        List of elements to exclude from the compound.

    properties : list of str
        List of properties to extract.

    Returns:
    --------
    data : pandas DataFrame
        DataFrame containing the extracted data.
    Example:
    --------
    >>> download('H', 2, ['O', 'F'], ['material_id', 'formation_energy_per_atom'])
    """
    parent_folder=os.getcwd()
    create_folder(parent_folder)
    # Check for the presence of the API key file
    if os.path.isfile("config.json") or os.path.isfile("../../config.json"):
        key = input_data["mpi_key"]["API_KEY"]
    else:
        print("config.json file not found. Please provide with your materials project api key\n")
    # Initialize MPRester with API key
    mpr = MPRester(key["key"])
    # Search for materials matching specified criteria
    mpr_search = mpr.materials.summary.search(elements=[elm],
                                     exclude_elements=exclude_el,
                                     fields=properties,
                                     num_elements=num_el)
    # Write extracted data to CSV file
    with open(parent_folder+"/download/"+ "data-"+ elm +".csv", "w") as data_elm:
        for i,propti in enumerate(properties):
            if propti == "structure":
                propty = "spacegroup"
                data_elm.write(propty + ",")
            else:
                propty = propti
                data_elm.write(propty + ",")
            #if i < len(properties) - 1:
            #else:
            #    data_elm.write(propty)
        data_elm.write("composition\n")
        for search in mpr_search:
            property_list = []
            for propty in properties:
                if propty == "structure":
                    property_list.append(search.structure.get_space_group_info()[0])
                else:
                    property_list.append(search.dict()[propty])
            for j,prop in enumerate(property_list):
                #data_elm.write(str(prop) + ",")
                if j < len(property_list) - 1:
                    data_elm.write(str(prop) + ",")
                else:
                    data_elm.write(str(prop) + ",")
            data_elm.write(str(search.structure.composition.formula.replace(" ", "")))
            data_elm.write("\n")
    # Read the CSV file into a DataFrame
    data=pd.read_csv(parent_folder+"/download/"+"data-" + elm + ".csv")
    print(data['material_id'])
    return data



[docs]
def stable(data):
    """
    Filters the compounds for those having negative formation energy.

    Parameters:
    -----------
    data : pandas DataFrame
        DataFrame containing information about compounds, including formation energy per atom.

    Returns:
    --------
    data : pandas DataFrame
        DataFrame containing compounds with negative formation energy per atom.
    """
    stable_filter = data["formation_energy_per_atom"] < 0
    data = data[stable_filter]
    data = data.reset_index(drop=True)
    return data



[docs]
def convexhull(data):
    """
    Filters the compounds close to the convex hull.

    Parameters:
    -----------
    data : pandas DataFrame
        DataFrame containing information about compounds, including energy above hull.

    Returns:
    --------
    data : pandas DataFrame
        DataFrame containing compounds close to the convex hull.
    """
    stable_fil = data["energy_above_hull"] < 0.001
    data = data[stable_fil].reset_index(drop=True)
    return data



[docs]
def metal_filter(data):
    """
    Filters metallic compounds from the input DataFrame.

    Parameters:
    -----------
    data : pandas DataFrame
        DataFrame containing information about compounds, including band_gap.

    Returns:
    --------
    data : pandas DataFrame
        DataFrame containing metallic compounds (band gap <= 0.00001).
    """
    zero_band_gap = data['band_gap'] <= 0.00001
    data = data[zero_band_gap]
    data = data.reset_index(drop=True)
    return data


[docs]
def data_combine(data1,data2):
    """
    Combines two pandas DataFrames into a single DataFrame.

    Parameters:
    -----------
    data1 : pandas DataFrame
        The first DataFrame to be combined.
    data2 : pandas DataFrame
        The second DataFrame to be combined.

    Returns:
    --------
    data : pandas DataFrame
        Combined DataFrame containing data from both data1 and data2.
    """
    data = pd.merge(data1, data2, how='outer')
    data = data.reset_index(drop=True)
    return data

#def data_2_prefix(data):
#    prefix = []
#    for j in range(data.shape[0]):
#        s = ""
#        elm = list(data['composition'][j].keys())
#        count = list(data['composition'][j].values())
#        for i in range(len(elm)):
#            s += elm[i]+str(int(count[i]))
#        prefix.append(s)
#    data.drop(columns=['composition'],axis=1)
#    data['composition'] = prefix
#    return data

[docs]
def remove(data,element_list):
    """
    Removes compounds containing specified elements from the DataFrame.

    Parameters:
    -----------
    data : pandas DataFrame
        The DataFrame containing compounds to be filtered.
    element_list : str, optional
        File with elements to exclude. Default is 'remove.list'.
        The file should contain elements separated by commas.
        For example, to remove oxygen and nitrogen, write 'O,N' in 'remove.list'.

    Returns:
    --------
    data : pandas DataFrame
        Processed DataFrame with compounds containing specified elements removed.
    """
    if not os.path.isfile(element_list):
        os.system("""echo "NA" > remove.list""")
    with open(element_list, "r") as read_remove:
        lines = read_remove.readlines()
    remove_elements = lines[0].replace("\n", "").split(',')
    pattern_remove = '|'.join(remove_elements)
    print(pattern_remove)
    filter_remove = data["formula_pretty"].str.contains(pattern_remove)
    filter_temp = []
    for rem in filter_remove:
        filter_temp.append(not rem)
    data = data[filter_temp].reset_index(drop=True)
    #data.to_csv(filename)
    return data



[docs]
def data_one_element_compound(elm,ntype,exclude_el,properties):
    """
    Extracts information for compounds containing only one element.

    Parameters:
    -----------
    elm : str
        Element to search for in compounds. For example, 'B' for boron.
    ntype : int or tuple
        Number of unique elements. Can be a single integer or a tuple (e.g., (1, 2) for 2 different types).
    exclude_el : list
        List of elements to exclude. For example, ['O', 'N'].
    properties : list
        List of properties to extract.

    Returns:
    --------
    data : pandas DataFrame
        DataFrame containing information for compounds with only one element.
    """
    data = download(elm,ntype,exclude_el,properties)
    #data = download(el,ntype,properties)
    #data = stable(data)
    #data = metal_filter(data)
    #data = remove(data,'remove.list')
    return data



[docs]
def data_two_element_compound(el1,el2,ntype,exclude_el,properties):
    """
    Extracts information for compounds containing two elements.

    Parameters:
    -----------
    el1 : str
        First element to search for in compounds (e.g., 'B' for boron).
    el2 : str
        Second element to search for in compounds (e.g., 'C' for carbon).
    ntype : int or tuple
        Number of unique elements in compounds. Can be a single integer or a tuple (e.g., (1, 2) for 2 different types).
    exclude_el : list
        List of elements to exclude. For example, ['O', 'N'].
    properties : list
        List of properties to extract.

    Returns:
    --------
    data : pandas DataFrame
        DataFrame containing information for compounds with two elements.
    """
    data1=download(el1,ntype,exclude_el,properties)
    data2=download(el2,ntype,exclude_el,properties)
    data = data_combine(data1,data2)
    #data = stable(data)
    #data = metal_filter(data)
    #data = remove(data,'remove.list')
    return data



[docs]
def create_input():
    """
    Reads 'download.csv' file inside 'download' folder and creates 'input.in' and 'mpid-list.in' files
    for further downloading and calculations.
    """
    data_file = pd.DataFrame(pd.read_csv('download/download.csv'))
    nrow = data_file.shape[0]
    with open("mpid-list.in", "a") as mpfile_append:
        for i in range(nrow):
            mpid = data_file['material_id'][i]
            #comp = data_file['formula_pretty'][i]
            comp = data_file['composition'][i]
            mpfile_append.write("v{} {} {}".format(i+1,mpid,comp) + "\n")
    with open('input.in', 'w') as input_write:
        if os.path.isfile("config.json") or os.path.isfile("../../config.json"):
            #import download as d
            d = input_data['download']
            input_write.write(str(d['inp']['start']) + "\n")
            input_write.write(str(d['inp']['end']) + "\n")
            input_write.write("{} 0".format(d['inp']['nkpt']) + "\n")
            input_write.write("mpid-list.in\n")
            input_write.write("{}".format(d['inp']['plot']) + "\n")
            input_write.write("DFT = {}".format(d['inp']['calc']) + "\n")
        else:
            input_write.write(str(1) + "\n")
            input_write.write(str(nrow) + "\n")
            input_write.write("200 0\n")
            input_write.write("mpid-list.in\n")
            input_write.write("phband\n")
            input_write.write("DFT = QE\n")



[docs]
def extract(ntype,properties,elm,exclude_el,nelm=1,metal=False,
            neg_fe=False,
            thermo_stable=False,
            ordering='NM',
            nsites=10,
            spacegroup=None,
            out='download/download.csv'):
    """
    Function to extract the data and apply filters, then write 'download.csv' file inside 'download' folder.

    Parameters:
    -----------
    ntype : tuple
        Number of unique elements in the compound. For example: (1, 3) for 3 different unique elements in compounds.
    properties : list
        List of properties to extract.
    elm : list
        List of elements used in search.
    exclude_el : list
        List of elements to exclude.
    nelm : int, optional
        Length of list elm. Default is 1.
    metal : bool, optional
        True to download zero bandgap compounds. Default is False.
    neg_fe : bool, optional
        True to download compounds with negative formation energy. Default is False.
    thermo_stable : bool, optional
        True to download compounds at the convex hull. Default is False.
    ordering : str, optional
        Magnetic ordering of the compound. Default is 'NM'.
    nsites : int, optional
        Maximum number of sites in the compound. Default is 10.
    spacegroup : int or str, optional
        Spacegroup number or name. Default is None.
    out : str, optional
        Output file to write. Default is 'download/download.csv'.

    Returns:
    --------
    data : pandas DataFrame
        Extracted data after applying filters.
    """
    if os.path.isdir("download_old"):
        os.system("rm -r download_old")
    if os.path.isdir("download"):
        print("A download folder is found, renaming download_old\n")
        os.system("mv download download_old")
    if nelm == 1:
        data = data_one_element_compound(elm[0],ntype,exclude_el,properties)
    elif nelm == 2:
        data = data_two_element_compound(elm[0],elm[1],ntype,exclude_el,properties)
    else:
        print("Upto 2 elements are allowed\n")
    if metal:
        data = metal_filter(data)
    if neg_fe:
        data = stable(data)
    if thermo_stable:
        data = convexhull(data)
    data = data[data['ordering'] == ordering].reset_index(drop=True)
    data = data[data['nsites'] <= nsites].reset_index(drop=True)
    if spacegroup:
        data = data[data['spacegroup'] == spacegroup].reset_index(drop=True)
    os.system("rm download/data*")
    #data = remove(data, 'remove.list',elm)
    data.to_csv(out)
    return data



[docs]
def download_by_entry(entries,must_include,size_constraint=20,ntype_constraint=5,FE=False,thermo_stable=True,metal=False,magnetic=False,spacegroup=None,properties=None):
    """
    Function to extract and create input files using "mp_api.client.MPRester.get_entries_in_chemsys" Function of the materials project API package (pip install mp_api).
    This mode is turned on when using 'mode':'chemsys' in 'download.py' file.

    Parameters:
    -----------
    entries : list
        List of elements ==> elements and compounds (combination of elements) to search.
    size_constraint : int, optional
        Size of the compounds (total number of ions). Upper bound not included. Default is 20.
    ntype_constraint : int, optional
        Number of different types of ions. Upper bound not included. Default is 5.
    must_include : list
        Elements that must be included in the compounds.
    FE : bool, optional
        True if the formation energy is negative. Default is False.
    metal : bool, optional
        True if the compound is a metal. Default is False.
    magnetic : bool, optional
        True if the compound has a non-zero magnetic moment. Default is False.
    spacegroup : int or str, optional
        Spacegroup number or name. Default is None.
    properties : list, optional
        List of properties to extract.

    Returns:
    --------
    None
    """
    if os.path.isdir("download_old"):
        os.system("rm -r download_old")
    if os.path.isdir("download"):
        print("A download folder is found, renaming download_old\n")
        os.system("mv download download_old")
    parent_folder=os.getcwd()
    create_folder(parent_folder)
    obj = MpConnect()
    # Create condition for must_include elements
    must_in = ""
    for i,elm in enumerate(must_include):
        if i == len(must_include) - 1:
            must_in += "'{}'".format(elm) + " in elm_list"
        else:
            must_in += "'{}'".format(elm) + " in elm_list or "
    must_in = "nelm < ntype_constraint and " + "({})".format(must_in)
    # Get entries in chemical system
    entries = obj.mpr.get_entries_in_chemsys(entries)
    entry = 1
    # Write header to CSV file
    with open(parent_folder+"/download/"+ "download.csv", "w") as data_elm:
        for i,propti in enumerate(properties):
            if propti == "structure":
                propty = "spacegroup"
            else:
                propty = propti
            if i < len(properties) - 1:
                data_elm.write(propty + ",")
            else:
                data_elm.write(propty)
        data_elm.write("\n")
    # Iterate over entries
    for i,_ in enumerate(entries):
        # Extract data for each entries
        mpid = entries[i].data['material_id']
        obj.setting(mpid)
        band_gap = obj.data['band_gap']
        form_energy = obj.data['formation_energy_per_atom']
        ordering = obj.data['ordering']
        nelm = len(entries[i].composition.elements)
        comp = entries[i].composition.formula.replace(' ','')
        count = int(entries[i].composition.num_atoms)
        elm_list = list(entries[i].composition.as_dict().keys())
        energy_above_hull = obj.data['energy_above_hull']
        # Define logic for filtering based on optional parameters
        if isinstance(thermo_stable, bool):
            if thermo_stable:
                thermo_logic = energy_above_hull < 0.0001
            else:
                thermo_logic = True
        elif isinstance(thermo_stable, (int, float)):
            thermo_logic = energy_above_hull < thermo_stable
        else:
            thermo_logic = True
        if metal:
            gap_logic = band_gap < 0.0001
        else:
            gap_logic = True
        if FE:
            fe_logic = form_energy < 0.0
        else:
            fe_logic = True
        if not magnetic:
            mag_logic = ordering == 'NM'
        else:
            mag_logic = True
        if not spacegroup:
            sg_logic = True
        else:
            sg_logic = obj.data['symmetry']['symbol'] == spacegroup
        print("Extracting {}".format(comp) + "\n")
        # Write data to input file and CSV file if conditions are met
        with open("mpid-list.in", "a") as mplist_write:
            if count < size_constraint and gap_logic and fe_logic and mag_logic and sg_logic and thermo_logic:
                if eval(must_in):
                    mplist_write.write("v{} {} {}".format(entry,mpid,obj.prefix) + "\n")
                    entry += 1
        property_list = []
        with open(parent_folder+"/download/"+ "download.csv", "a") as data_elm:
            for propty in properties:
                if propty == "structure":
                    property_list.append(obj.data['symmetry']['symbol'])
                else:
                    property_list.append(obj.data[propty])
            for j,prop in enumerate(property_list):
                if j < len(property_list) - 1:
                    data_elm.write(str(prop) + ",")
                else:
                    data_elm.write(str(prop))
            data_elm.write("\n")


[docs]
def main():
    """
    Main function to orchestrate the data extraction and input file creation process.

    If 'mpid-list.in' file does not exist, the function reads settings from 'config.json' to
    create mpid-list.in file.

    Parameters:
    -----------------
    None

    Returns:
    -----------------
    None
    """
    # Check if 'mpid-list.in' file exists
    if not os.path.isfile("mpid-list.in"):
        # Check if 'config.json' exists
        if os.path.isfile("config.json") or os.path.isfile("../../config.json"):
            # Read settings from 'config.json' to create 'mpid-list.in' file
            d = input_data["download"]
            mode = d['mode']
            ntype = d['element']['ntype']
            exclude_el = d['element']['exclude']
            elm_list = d['element']['elm']
            nelm = len(elm_list)
            properties = d['element']['prop']
            metal=d['element']['metal']
            neg_fe=d['element']['FE']
            thermo_stable=d['element']['thermo_stable']
            ordering=d['element']['ordering']
            nsites=d['element']['nsites']
            spacegroup=d['element']['spacegroup']
        else:
            # Provide default settings if 'config.json' doesn't exist
            print("input file config.json not found\n")
            print("Create one with following format\n")
            msg="""element={'metal':True, 'FE':True, 'exclude':["O", "N", "F", "Cl", "Br", "I"],'ntype':(1,2), 'elm':['B'], 'prop':["material_id", "formula_pretty", "structure", "formation_energy_per_atom", "band_gap", "energy_above_hull","nsites","ordering","nsites"],'ordering':'NM','nsites':10,'spacegroup':None}
inp=    {'start':1, 'end':50, 'nkpt':200, 'evenkpt': False, 'plot':'phband', 'calc':'QE'}
chemsys={'entries':['B'],'size_constraint':20,'ntype_constraint':5,'must_include':['Mg'],'FE':False,'metal':False,'magnetic':False,'spacegroup':None}"""
            print(msg + "\n")
            print("Utilizing default settings\n")
            ntype = (1,2) #Number of different types of element in the compound.
            exclude_el = ["Lu"]
            nsites = 10
            #exclude_el = ["O", "N", "F", "Cl", "Br", "I"]
            elm = 'B'
            nelm = 1
            elm_list = [elm]
            metal = False
            neg_fe = False
            thermo_stable = False
            ordering = 'FM'
            spacegroup = None
            properties=["material_id", "formula_pretty", "structure", "formation_energy_per_atom", "band_gap", "energy_above_hull","total_magnetization","ordering",'total_magnetization_normalized_formula_units', 'num_magnetic_sites','theoretical','nsites']
            default1={'metal':metal,'FE':neg_fe, 'thermo_stable':thermo_stable, 'exclude':exclude_el,'ntype':(1,2),'elm':[elm],'prop':properties,'ordering':ordering,'nsites':nsites,'spacegroup':spacegroup}
            default2={'start':1, 'end':2, 'nkpt':200, 'evenkpt': False, 'plot':'phband','calc':'QE'}
            chemsys={'entries':['B','Mg'],'size_constraint':20,'ntype_constraint':5,'must_include':['Mg','B'],'FE':False,'metal':False, 'magnetic':False,'spacegroup':spacegroup}
            d = {
                 'element':default1,
                 'inp':default2,
                 'chemsys':chemsys
                 }
            # Default mode is 'element'
            mode = 'element'
        # Perform actions based on mode
        if mode == 'element':
            # Extract data and create input files
            data = extract(ntype,properties,elm_list,exclude_el,
                           nelm=nelm,
                           metal=metal,
                           neg_fe=neg_fe,
                           thermo_stable=thermo_stable,
                           ordering=ordering,
                           nsites=nsites,
                           spacegroup=spacegroup)
            create_input()
        elif mode == 'chemsys':
            # Download data for compounds based on chemical system
            download_by_entry(d['chemsys']['entries'],d['chemsys']['must_include'],d['chemsys']['size_constraint'],d['chemsys']['ntype_constraint'],
                              d['chemsys']['FE'],d['chemsys']['thermo_stable'],d['chemsys']['metal'],d['chemsys']['magnetic'],d['chemsys']['spacegroup'],properties)
        elif mode == 'fromcif':
            # List CIF files
            list_cif = glob.glob("*.cif",recursive=True)
            if len(list_cif) > 0:
                print("These cif files are found\n")
                for cif in list_cif:
                    print(cif + "\n")
        elif mode == 'fromvasp':
            # List VASP files
            list_vasp = glob.glob("*.vasp",recursive=True)
            if len(list_vasp) > 0:
                print("These .vasp files are found\n")
                for vasp in list_vasp:
                    print(vasp + "\n")
        else:
            print("mode = element, chemsys, fromcif, or fromvasp available\n")

if __name__ == "__main__":
    input_data = config()
    main()