In [5]:
from copy import deepcopy
import numpy as np
import pandas as pd
import calendar
from pybliometrics.scopus import ScopusSearch
from pybliometrics.scopus import AbstractRetrieval

from tqdm import tqdm

# Show up to 30 columns when a DataFrame is displayed.
pd.options.display.max_columns = 30

# Path of the text file that the export cell writes.
filename = "BIPV_ML.txt"

In [15]:
# Load the pickled sample frame and preview its first three rows.
# NOTE(review): unpickling can execute arbitrary code; only load .pkl files
# that come from a trusted source.
df_sample = pd.read_pickle("df_sample.pkl")
df_sample.head(3)

Unnamed: 0,eid,doi,pii,pubmed_id,title,subtype,subtypeDescription,creator,afid,affilname,affiliation_city,affiliation_country,author_count,author_names,author_ids,...,issn,source_id,eIssn,aggregationType,volume,issueIdentifier,article_number,pageRange,description,authkeywords,citedby_count,openaccess,fund_acr,fund_no,fund_sponsor
0,2-s2.0-85125145796,10.1016/j.egyr.2022.02.088,S2352484722003353,,Data driven approach to forecast the next day ...,ar,Article,Fentis A.,60017798;127756529;116607338,Faculté des Sciences et Techniques de Mohammed...,Mohammedia;Mohammedia;Casablanca,Morocco;Morocco;Morocco,5,"Fentis, Ayoub;Rafik, Mohamed;Bahatti, Lhoussai...",57195404384;57208745688;55837734100;6506839198...,...,,21100389511,23524847.0,Journal,8,,,3221-3233,Photovoltaic (PV) power has became an attracti...,Machine learning | PV power forecasting | Rene...,0,1,,undefined,
1,2-s2.0-85123676147,10.1016/j.egyr.2022.01.105,S2352484722001056,,Energy modeling and photovoltaics integration ...,ar,Article,Al Huneidi D.I.,60113885,"Hamad Bin Khalifa University, College of Scien...",Doha,Qatar,3,"Al Huneidi, Dana I.;Tahir, Furqan;Al-Ghamdi, S...",57389610600;57201605786;56439247500,...,,21100389511,23524847.0,Journal,8,,,166-171,Due to Qatar's increasing population and elect...,Climate change | Demand side management | Desi...,1,1,QNRF,NPRP12S-0212-190073,Qatar National Research Fund
2,2-s2.0-85124409874,10.1016/j.engappai.2022.104707,S0952197622000288,,Multi-quantile recurrent neural network for fe...,ar,Article,Zhang X.Y.,60020595;60001455,"Royal Holloway, University of London;Anhui Uni...",Egham;Hefei,United Kingdom;China,3,"Zhang, Xiao Yu;Watkins, Chris;Kuenzel, Stefanie",57239241400;57447667900;55875409300,...,9521976.0,24182,,Journal,110,,104707.0,,The purpose of feeder-level energy disaggregat...,Behind-the-meter PV generation | Deep neural n...,0,0,,undefined,


In [16]:
# Load the pickled frame whose rows the export cell turns into records,
# then show its (rows, columns) shape.
# NOTE(review): unpickling can execute arbitrary code; only load .pkl files
# that come from a trusted source.
df_ab = pd.read_pickle("df_ab.pkl")
df_ab.shape

(1028, 28)

In [17]:
# publication type
# Maps a two-letter document subtype code to the one-letter publication type
# written to the PT field:
#   "J" = Journal, "B" = Book, "S" = Series, "P" = Patent
dict_pubtype = dict([
    ('ar', 'J'),  # Article
    ('ab', 'J'),  # Abstract Report
    ('bk', 'B'),  # Book
    ('ch', 'B'),  # Book Chapter
    ('bz', 'J'),  # Business Article
    ('cp', 'J'),  # Conference Paper
    ('cr', 'J'),  # Conference Review
    ('dp', 'J'),  # Data Paper
    ('ed', 'S'),  # Editorial
    ('er', 'J'),  # Erratum
    ('le', 'J'),  # Letter
    ('no', 'S'),  # Note
    ('pr', 'S'),  # Press Release
    ('rp', 'J'),  # Report
    ('tb', 'J'),  # Retracted
    ('re', 'J'),  # Review
    ('sh', 'J'),  # Short Survey
    ('ip', 'P'),  # Patent
])

# documentation type
# Maps a two-letter document subtype code to the human-readable document type
# written to the DT field.  Values carry no surrounding whitespace, because
# they are emitted verbatim into the output file.
dict_docutype = {'ar': 'Article',
                 'ab': 'Abstract Report',
                 'bk': 'Book',
                 'ch': 'Book Chapter',
                 'bz': 'Business Article',
                 'cp': 'Conference Paper',
                 'cr': 'Conference Review',
                 'dp': 'Data Paper',   # was 'Data Paper ' (stray trailing space)
                 'ed': 'Editorial',
                 'er': 'Erratum',
                 'le': 'Letter',
                 'no': 'Note',
                 'pr': 'Press Release',
                 'rp': 'Report',
                 'tb': 'Retracted',
                 're': 'Review',
                 'sh': 'Short Survey',
                 'ip': 'Patent'
                }

# text cleaning
# HTML entities decoded by get_clean_text, mapped to the characters they
# stand for.
dict_clean_text = {"&amp;": "&", 
                   "&nbsp;": " ", 
                   "&lt;": "<",
                   "&gt;": ">"
                  }
def get_clean_text(text):
    """Return ``text`` with every entity in ``dict_clean_text`` decoded once.

    Parameters
    ----------
    text : str
        Text that may contain the HTML entities listed in ``dict_clean_text``.

    Returns
    -------
    str
        The text with each entity replaced by its character.

    Notes
    -----
    ``"&amp;"`` is always decoded last.  Decoding it first would turn an
    escaped entity such as ``"&amp;lt;"`` into ``"&lt;"``, which a later
    replacement would then decode a second time (to ``"<"``).
    """
    # sorted() is stable, so the remaining entities keep their dict order and
    # only "&amp;" is moved to the end.
    for entity in sorted(dict_clean_text, key=lambda e: e == "&amp;"):
        text = text.replace(entity, dict_clean_text[entity])
    return text

In [18]:
# all
# Export every row of df_ab as one tagged record of a Web of Science style
# plain-text file: each field starts with a two-letter tag, multi-value fields
# continue on lines indented by three spaces, "ER" closes a record and "EF"
# closes the file.
# NOTE(review): missing values in the pass-through fields are written as the
# literal text "None"/"nan" (whatever str() yields) -- confirm that downstream
# tools expect this.

import os, calendar  # NOTE(review): neither module is used in this cell

# Explicit UTF-8: the default encoding of open() depends on the platform
# locale, so names with non-ASCII characters could fail to encode or be written
# in a legacy code page.
with open(filename, "w", encoding="utf-8") as datafile:
    datafile.write("FN Clarivate Analytics Web of Science\nVR 1.0")

    # Positional access always yields exactly one row as a Series, even when
    # index labels repeat (label-based .loc would return a DataFrame then).
    for pos in range(len(df_ab)):
        df_row = df_ab.iloc[pos]

        # 1. PT: publication type
        datafile.write(f"\nPT {dict_pubtype[df_row['PT']]}")

        # 2. AU: author names
        # 3. AF: affiliations
        # One entry per line.  The values are only written when every entry is
        # truthy; a single empty/None entry leaves the whole field blank.
        for tag in ("AU", "AF"):
            entries = df_row[tag]
            field = f"\n{tag} "
            if all(entries):
                field += "\n   ".join(entries)
            datafile.write(field)

        # 4. TI: document title
        # 5. SO: publication name
        # HTML entities are decoded; a falsy value leaves the field blank.
        for tag in ("TI", "SO"):
            raw_text = df_row[tag]
            field = f"\n{tag} "
            if raw_text:
                field += get_clean_text(raw_text)
            datafile.write(field)

        # 6. LA : Language
        datafile.write("\nLA " + df_row["LA"])

        # 7. DT : Document Type
        datafile.write(f"\nDT {dict_docutype[df_row['DT']]}")

        # 8.  DE : Author Keywords
        # 9.  ID : Keyword Plus
        # 10. AB : Abstract
        for tag in ("DE", "ID", "AB"):
            datafile.write(f"\n{tag} {df_row[tag]}")

        # 11. C1 : Author Address
        # Each entry c is indexable: c[1] is an iterable of strings joined with
        # "; " inside brackets, and c[2] is a ", "-separated string whose first
        # component is dropped.  Entries are terminated with ".".
        addresses = [
            f"[{'; '.join(c[1])}] {', '.join(c[2].split(', ')[1:])}"
            for c in df_row['C1']
        ]
        # With no addresses the field stays blank instead of holding a lone ".".
        c1 = (".\n   ".join(addresses) + ".") if addresses else ""
        datafile.write("\nC1 " + c1)

        # 12. RP : Reprint Address
        datafile.write("\nRP None")

        # 13. EM : E-mail Address
        datafile.write("\nEM None")

        # 14. CR : Cited References
        # Work on a copy: the Series cell holds the very list object stored in
        # df_ab, so editing it in place would silently alter the source frame.
        references = list(df_row["CR"])
        if references:
            # Blank out the first two characters of the first reference.
            references[0] = "  " + references[0][2:]
            # Keep references longer than 5 characters whose 4th character is
            # not a comma, or that mention "DOI"; the test runs on the
            # unstripped text, leading spaces are removed afterwards.
            references = [
                ref.lstrip(" ")
                for ref in references
                if len(ref) > 5 and ((ref[3] != ',') or ("DOI" in ref))
            ]
        datafile.write("\nCR " + "\n   ".join(references))

        # Fields copied through unchanged, in output order.  Tags in the
        # numbered list that are not written: U1, U2, PU, PI, PA, EI, PG, WC.
        for tag in (
            "NR",  # 15. Cited Reference Count
            "TC",  # 16. Web of Science Core Collection Times Cited Count
            "Z9",  # 17. Total Times Cited Count
            "SN",  # 23. International Standard Serial Number (ISSN) = 0959-6526
            "J9",  # 25. 29-Character Source Abbreviation = J CLEAN PROD
            "JI",  # 26. ISO Source Abbreviation = J. Clean Prod.
            "PD",  # 27. Publication Date = JUL 1
            "PY",  # 28. Publication Year = 2020
            "VL",  # 29. Volume = 260
            "AR",  # 30. Article Number = 121059
            "DI",  # 31. Digital Object Identifier = 10.1016/j.jclepro.2020.121059
            "SC",  # 34. Research Areas
        ):
            datafile.write(f"\n{tag} {df_row[tag]}")

        # end of record
        datafile.write("\nER\n")

    # end of file
    datafile.write("\nEF\n")