{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from copy import deepcopy\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import calendar\n",
    "from pybliometrics.scopus import ScopusSearch\n",
    "from pybliometrics.scopus import AbstractRetrieval\n",
    "\n",
    "from tqdm import tqdm\n",
    "\n",
    "pd.options.display.max_columns = 30\n",
    "\n",
    "filename = \"./sample.txt\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "### language\n",
    "dic_language = {'eng': \"English\",\n",
    "                'kor': \"Korean\"\n",
    "               }\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(1028, 34)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>eid</th>\n",
       "      <th>doi</th>\n",
       "      <th>pii</th>\n",
       "      <th>pubmed_id</th>\n",
       "      <th>title</th>\n",
       "      <th>subtype</th>\n",
       "      <th>subtypeDescription</th>\n",
       "      <th>creator</th>\n",
       "      <th>afid</th>\n",
       "      <th>affilname</th>\n",
       "      <th>affiliation_city</th>\n",
       "      <th>affiliation_country</th>\n",
       "      <th>author_count</th>\n",
       "      <th>author_names</th>\n",
       "      <th>author_ids</th>\n",
       "      <th>...</th>\n",
       "      <th>issn</th>\n",
       "      <th>source_id</th>\n",
       "      <th>eIssn</th>\n",
       "      <th>aggregationType</th>\n",
       "      <th>volume</th>\n",
       "      <th>issueIdentifier</th>\n",
       "      <th>article_number</th>\n",
       "      <th>pageRange</th>\n",
       "      <th>description</th>\n",
       "      <th>authkeywords</th>\n",
       "      <th>citedby_count</th>\n",
       "      <th>openaccess</th>\n",
       "      <th>fund_acr</th>\n",
       "      <th>fund_no</th>\n",
       "      <th>fund_sponsor</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2-s2.0-85125145796</td>\n",
       "      <td>10.1016/j.egyr.2022.02.088</td>\n",
       "      <td>S2352484722003353</td>\n",
       "      <td>None</td>\n",
       "      <td>Data driven approach to forecast the next day ...</td>\n",
       "      <td>ar</td>\n",
       "      <td>Article</td>\n",
       "      <td>Fentis A.</td>\n",
       "      <td>60017798;127756529;116607338</td>\n",
       "      <td>Faculté des Sciences et Techniques de Mohammed...</td>\n",
       "      <td>Mohammedia;Mohammedia;Casablanca</td>\n",
       "      <td>Morocco;Morocco;Morocco</td>\n",
       "      <td>5</td>\n",
       "      <td>Fentis, Ayoub;Rafik, Mohamed;Bahatti, Lhoussai...</td>\n",
       "      <td>57195404384;57208745688;55837734100;6506839198...</td>\n",
       "      <td>...</td>\n",
       "      <td>None</td>\n",
       "      <td>21100389511</td>\n",
       "      <td>23524847</td>\n",
       "      <td>Journal</td>\n",
       "      <td>8</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>3221-3233</td>\n",
       "      <td>Photovoltaic (PV) power has became an attracti...</td>\n",
       "      <td>Machine learning | PV power forecasting | Rene...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>None</td>\n",
       "      <td>undefined</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2-s2.0-85123676147</td>\n",
       "      <td>10.1016/j.egyr.2022.01.105</td>\n",
       "      <td>S2352484722001056</td>\n",
       "      <td>None</td>\n",
       "      <td>Energy modeling and photovoltaics integration ...</td>\n",
       "      <td>ar</td>\n",
       "      <td>Article</td>\n",
       "      <td>Al Huneidi D.I.</td>\n",
       "      <td>60113885</td>\n",
       "      <td>Hamad Bin Khalifa University, College of Scien...</td>\n",
       "      <td>Doha</td>\n",
       "      <td>Qatar</td>\n",
       "      <td>3</td>\n",
       "      <td>Al Huneidi, Dana I.;Tahir, Furqan;Al-Ghamdi, S...</td>\n",
       "      <td>57389610600;57201605786;56439247500</td>\n",
       "      <td>...</td>\n",
       "      <td>None</td>\n",
       "      <td>21100389511</td>\n",
       "      <td>23524847</td>\n",
       "      <td>Journal</td>\n",
       "      <td>8</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>166-171</td>\n",
       "      <td>Due to Qatar's increasing population and elect...</td>\n",
       "      <td>Climate change | Demand side management | Desi...</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>QNRF</td>\n",
       "      <td>NPRP12S-0212-190073</td>\n",
       "      <td>Qatar National Research Fund</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2-s2.0-85124409874</td>\n",
       "      <td>10.1016/j.engappai.2022.104707</td>\n",
       "      <td>S0952197622000288</td>\n",
       "      <td>None</td>\n",
       "      <td>Multi-quantile recurrent neural network for fe...</td>\n",
       "      <td>ar</td>\n",
       "      <td>Article</td>\n",
       "      <td>Zhang X.Y.</td>\n",
       "      <td>60020595;60001455</td>\n",
       "      <td>Royal Holloway, University of London;Anhui Uni...</td>\n",
       "      <td>Egham;Hefei</td>\n",
       "      <td>United Kingdom;China</td>\n",
       "      <td>3</td>\n",
       "      <td>Zhang, Xiao Yu;Watkins, Chris;Kuenzel, Stefanie</td>\n",
       "      <td>57239241400;57447667900;55875409300</td>\n",
       "      <td>...</td>\n",
       "      <td>09521976</td>\n",
       "      <td>24182</td>\n",
       "      <td>None</td>\n",
       "      <td>Journal</td>\n",
       "      <td>110</td>\n",
       "      <td>None</td>\n",
       "      <td>104707</td>\n",
       "      <td>None</td>\n",
       "      <td>The purpose of feeder-level energy disaggregat...</td>\n",
       "      <td>Behind-the-meter PV generation | Deep neural n...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>None</td>\n",
       "      <td>undefined</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>3 rows × 34 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                  eid                             doi                pii  \\\n",
       "0  2-s2.0-85125145796      10.1016/j.egyr.2022.02.088  S2352484722003353   \n",
       "1  2-s2.0-85123676147      10.1016/j.egyr.2022.01.105  S2352484722001056   \n",
       "2  2-s2.0-85124409874  10.1016/j.engappai.2022.104707  S0952197622000288   \n",
       "\n",
       "  pubmed_id                                              title subtype  \\\n",
       "0      None  Data driven approach to forecast the next day ...      ar   \n",
       "1      None  Energy modeling and photovoltaics integration ...      ar   \n",
       "2      None  Multi-quantile recurrent neural network for fe...      ar   \n",
       "\n",
       "  subtypeDescription          creator                          afid  \\\n",
       "0            Article        Fentis A.  60017798;127756529;116607338   \n",
       "1            Article  Al Huneidi D.I.                      60113885   \n",
       "2            Article       Zhang X.Y.             60020595;60001455   \n",
       "\n",
       "                                           affilname  \\\n",
       "0  Faculté des Sciences et Techniques de Mohammed...   \n",
       "1  Hamad Bin Khalifa University, College of Scien...   \n",
       "2  Royal Holloway, University of London;Anhui Uni...   \n",
       "\n",
       "                   affiliation_city      affiliation_country author_count  \\\n",
       "0  Mohammedia;Mohammedia;Casablanca  Morocco;Morocco;Morocco            5   \n",
       "1                              Doha                    Qatar            3   \n",
       "2                       Egham;Hefei     United Kingdom;China            3   \n",
       "\n",
       "                                        author_names  \\\n",
       "0  Fentis, Ayoub;Rafik, Mohamed;Bahatti, Lhoussai...   \n",
       "1  Al Huneidi, Dana I.;Tahir, Furqan;Al-Ghamdi, S...   \n",
       "2    Zhang, Xiao Yu;Watkins, Chris;Kuenzel, Stefanie   \n",
       "\n",
       "                                          author_ids  ...      issn  \\\n",
       "0  57195404384;57208745688;55837734100;6506839198...  ...      None   \n",
       "1                57389610600;57201605786;56439247500  ...      None   \n",
       "2                57239241400;57447667900;55875409300  ...  09521976   \n",
       "\n",
       "     source_id     eIssn aggregationType volume issueIdentifier  \\\n",
       "0  21100389511  23524847         Journal      8            None   \n",
       "1  21100389511  23524847         Journal      8            None   \n",
       "2        24182      None         Journal    110            None   \n",
       "\n",
       "  article_number  pageRange  \\\n",
       "0           None  3221-3233   \n",
       "1           None    166-171   \n",
       "2         104707       None   \n",
       "\n",
       "                                         description  \\\n",
       "0  Photovoltaic (PV) power has became an attracti...   \n",
       "1  Due to Qatar's increasing population and elect...   \n",
       "2  The purpose of feeder-level energy disaggregat...   \n",
       "\n",
       "                                        authkeywords citedby_count openaccess  \\\n",
       "0  Machine learning | PV power forecasting | Rene...             0          1   \n",
       "1  Climate change | Demand side management | Desi...             1          1   \n",
       "2  Behind-the-meter PV generation | Deep neural n...             0          0   \n",
       "\n",
       "  fund_acr              fund_no                  fund_sponsor  \n",
       "0     None            undefined                          None  \n",
       "1     QNRF  NPRP12S-0212-190073  Qatar National Research Fund  \n",
       "2     None            undefined                          None  \n",
       "\n",
       "[3 rows x 34 columns]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the cached sample table. Unpickling can execute arbitrary code,\n",
    "# so only read pickle files that you produced yourself.\n",
    "df_sample = pd.read_pickle(\"./df_sample.pkl\")\n",
    "\n",
    "# (rows, columns) of the table, followed by its first three rows.\n",
    "print(df_sample.shape)\n",
    "df_sample.iloc[:3]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "columns=[\"eid\", \"PT\", \"AU\", \"AF\", \"TI\", \n",
    "         \"SO\", \"SO_abb\", \"LA\", \"DT\", \"DE\", \n",
    "         \"ID\", \"AB\", \"C1\", \"RP\", \"EM\", \n",
    "         \"CR\", \"NR\", \"TC\", \"Z9\", \"SN\", \n",
    "         \"J9\", \"JI\", \"PD\", \"PY\", \"VL\", \n",
    "         \"AR\", \"DI\", \"SC\"]\n",
    "\n",
    "df_ab = pd.DataFrame(columns=columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "looping over df_sample: 100%|██████████| 1028/1028 [00:17<00:00, 58.24it/s]\n"
     ]
    }
   ],
   "source": [
    "def _author_addresses(ab):\n",
    "    \"\"\"Group the authors of `ab` by affiliation for the C1 (author address) field.\n",
    "\n",
    "    Returns a list of [auids, indexed_names, \"id, name, city, country\"] entries,\n",
    "    one per affiliation that the merge matches to at least one author, or []\n",
    "    when the record has no affiliations or no authors.\n",
    "    \"\"\"\n",
    "    if not (ab.affiliation and ab.authors):\n",
    "        return []\n",
    "\n",
    "    df_aff = pd.DataFrame(ab.affiliation)\n",
    "    # Affiliation ids are cast to str before being merged with the authors' affiliation ids.\n",
    "    df_aff[\"id\"] = df_aff[\"id\"].astype(str)\n",
    "    df_authgroup = (\n",
    "        pd.DataFrame(ab.authors)\n",
    "        .explode(\"affiliation\")\n",
    "        .groupby(\"affiliation\")\n",
    "        .agg(list)\n",
    "        .reset_index()\n",
    "    )\n",
    "    df_aff = pd.merge(df_aff, df_authgroup, left_on=\"id\", right_on=\"affiliation\").drop([\"affiliation\"], axis=1)\n",
    "    df_aff[\"address\"] = df_aff[[\"id\", \"name\", \"city\", \"country\"]].apply(lambda x: \", \".join(x.astype(str)), axis=1)\n",
    "    return df_aff[[\"auid\", \"indexed_name\", \"address\"]].values.tolist()\n",
    "\n",
    "\n",
    "def _cited_references(ab):\n",
    "    \"\"\"Format the cited references of `ab` as WoS-style CR lines.\n",
    "\n",
    "    Each line is \"authors, year, source, volume, first page\" plus \", DOI doi\"\n",
    "    when a DOI exists; missing parts are skipped and missing authors become\n",
    "    \"[Anonymous]\". The first line is prefixed with \"CR \" and the others with\n",
    "    three spaces. Returns [] when the record has no references.\n",
    "    \"\"\"\n",
    "    if not ab.references:\n",
    "        return []\n",
    "\n",
    "    lines = []\n",
    "    # Walk the rows that actually exist instead of range(ab.refcount), so a count\n",
    "    # that is missing or larger than the parsed list cannot raise an error.\n",
    "    for i, (_, ref_row) in enumerate(pd.DataFrame(ab.references).iterrows()):\n",
    "        ref = ref_row[\"authors\"]\n",
    "        if ref is None:\n",
    "            ref = \"[Anonymous]\"\n",
    "        for key in (\"publicationyear\", \"sourcetitle\", \"volume\", \"first\"):\n",
    "            item = ref_row[key]\n",
    "            if item is not None:\n",
    "                ref = \", \".join([ref, str(item)])\n",
    "        doi = ref_row[\"doi\"]\n",
    "        if doi is not None:\n",
    "            ref = ref + f\", DOI {doi}\"\n",
    "        lines.append(f\"CR {ref}\" if i == 0 else f\"   {ref}\")\n",
    "    return lines\n",
    "\n",
    "\n",
    "def _wos_record(art):\n",
    "    \"\"\"Retrieve the abstract for row `art` of df_sample and map it to WoS-style field tags.\n",
    "\n",
    "    Returns a dict keyed by the names in `columns`. ID, RP and EM are always the\n",
    "    string \"None\"; the WoS tags U1, U2, PU, PI, PA, EI, PG, WC, GA, UT and DA are\n",
    "    not produced.\n",
    "    \"\"\"\n",
    "    eid = df_sample.loc[art, \"eid\"]\n",
    "    ab = AbstractRetrieval(eid, view=\"FULL\")\n",
    "\n",
    "    # AU / AF: indexed and \"surname, given name\" author names ([None] when there are no authors)\n",
    "    authors = ab.authors or []\n",
    "    index_name_ = [a.indexed_name for a in authors] or [None]\n",
    "    author_name_ = [f\"{a.surname}, {a.given_name}\" for a in authors] or [None]\n",
    "\n",
    "    # LA: language name; codes that cannot be looked up in dic_language become \"unknown\".\n",
    "    # Only lookup failures are caught, so unrelated errors are no longer hidden.\n",
    "    try:\n",
    "        language_ = dic_language[ab.language]\n",
    "    except (KeyError, TypeError):\n",
    "        language_ = \"unknown\"\n",
    "\n",
    "    # DE: author keywords joined with \"; \"\n",
    "    auth_kw_ = ab.authkeywords\n",
    "    auth_kw_ = \"; \".join(auth_kw_) if auth_kw_ else \"None\"\n",
    "\n",
    "    # J9 / JI: both come from the source title abbreviation; J9 is the upper-cased form\n",
    "    src_abb_ = ab.sourcetitle_abbreviation\n",
    "    ji_ = src_abb_ if src_abb_ is not None else \"None\"\n",
    "\n",
    "    # PD / PY: coverDate is split on \"-\" into year, month and day, e.g. PD = \"JUL 1\"\n",
    "    year, month, day = ab.coverDate.split(\"-\")\n",
    "    pd_ = f\"{calendar.month_name[int(month)][:3].upper()} {int(day)}\"\n",
    "\n",
    "    # AR: article number taken from df_sample\n",
    "    ar_ = df_sample.loc[art, \"article_number\"]\n",
    "    if ar_ is None:\n",
    "        ar_ = \"None\"\n",
    "\n",
    "    # SC: subject areas joined with \"; \"; \"None\" when absent so the column holds only strings\n",
    "    if ab.subject_areas:\n",
    "        sc_ = \"; \".join(pd.DataFrame(ab.subject_areas)[\"area\"].tolist())\n",
    "    else:\n",
    "        sc_ = \"None\"\n",
    "\n",
    "    return {\n",
    "        \"eid\": eid,\n",
    "        \"PT\": ab.srctype,  # source type; this used to be overwritten by the document subtype\n",
    "        \"AU\": index_name_,\n",
    "        \"AF\": author_name_,\n",
    "        \"TI\": df_sample.loc[art, \"title\"],\n",
    "        \"SO\": ab.publicationName,\n",
    "        \"SO_abb\": src_abb_,\n",
    "        \"LA\": language_,\n",
    "        \"DT\": df_sample.loc[art, \"subtype\"],\n",
    "        \"DE\": auth_kw_,\n",
    "        \"ID\": \"None\",\n",
    "        \"AB\": ab.abstract,\n",
    "        \"C1\": _author_addresses(ab),\n",
    "        \"RP\": \"None\",\n",
    "        \"EM\": \"None\",\n",
    "        \"CR\": _cited_references(ab),\n",
    "        \"NR\": ab.refcount,\n",
    "        \"TC\": ab.citedby_count,\n",
    "        \"Z9\": ab.citedby_count,\n",
    "        \"SN\": \"None\" if ab.issn is None else ab.issn,\n",
    "        \"J9\": ji_.upper(),\n",
    "        \"JI\": ji_,\n",
    "        \"PD\": pd_,\n",
    "        \"PY\": year,\n",
    "        \"VL\": ab.volume,\n",
    "        \"AR\": ar_,\n",
    "        \"DI\": ab.doi,\n",
    "        \"SC\": sc_,\n",
    "    }\n",
    "\n",
    "\n",
    "# Raise starting_index to resume after an interrupted run; 0 rebuilds df_ab from scratch.\n",
    "starting_index = 0\n",
    "data_index = list(range(starting_index, df_sample.shape[0]))\n",
    "\n",
    "records = []\n",
    "try:\n",
    "    for art in tqdm(data_index, desc=\"looping over df_sample\"):\n",
    "        records.append(_wos_record(art))\n",
    "finally:\n",
    "    # Runs even when a retrieval fails part-way, so the rows fetched so far are kept.\n",
    "    # The frame is built once from the collected records instead of concatenating per row;\n",
    "    # dtype=object keeps None values as None.\n",
    "    df_new = pd.DataFrame(records, columns=columns, dtype=object)\n",
    "    if starting_index == 0:\n",
    "        df_ab = df_new\n",
    "    elif records:\n",
    "        df_ab = pd.concat([df_ab, df_new], ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1028, 28)"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Write the table to disk with a fresh 0..n-1 index, then show its (rows, columns).\n",
    "pd.to_pickle(df_ab.reset_index(drop=True), \"df_ab.pkl\")\n",
    "df_ab.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}