Data sources

WikipediaLoader

Bases: BaseWebAPIDataLoader

Source code in autoresearcher/data_sources/web_apis/wikipedia_loader.py
class WikipediaLoader(BaseWebAPIDataLoader):
    def __init__(self):
        super().__init__("https://en.wikipedia.org/w/api.php")

    def fetch_data(self, search_query, results=10, language="en"):
        """
        Fetches data from the Wikipedia API.
        Args:
          search_query (str): The query to search for.
          results (int, optional): The maximum number of results to return. Defaults to 10.
          language (str, optional): The language to search in. Defaults to "en".
        Returns:
          list: A list of dictionaries containing the data for each result.
        Raises:
          wikipedia.exceptions.DisambiguationError: If the search query returns a disambiguation page.
        Examples:
          >>> loader = WikipediaLoader()
          >>> loader.fetch_data("Python")
          [
            {
              "title": "Python (programming language)",
              "url": "https://en.wikipedia.org/wiki/Python_(programming_language)",
              "summary": "Python is an interpreted, high-level, general-purpose programming language.",
              "content": "Python is an interpreted, high-level, general-purpose programming language...",
              "categories": ["Programming languages"],
              "references": ["https://www.python.org/"]
            }
          ]
        """
        wikipedia.set_lang(language)
        wikipedia.set_rate_limiting(True)

        search_results = wikipedia.search(search_query, results=results)
        data = []

        for result in search_results:
            try:
                page = wikipedia.page(result)
                data.append(
                    {
                        "title": page.title,
                        "url": page.url,
                        "summary": page.summary,
                        "content": page.content,
                        "categories": page.categories,
                        "references": page.references,
                    }
                )
            except wikipedia.exceptions.DisambiguationError as e:
                # Handle disambiguation pages by selecting the first option
                if e.options:
                    page = wikipedia.page(e.options[0])
                    data.append(
                        {
                            "title": page.title,
                            "url": page.url,
                            "summary": page.summary,
                            "content": page.content,
                            "categories": page.categories,
                            "references": page.references,
                        }
                    )
            except wikipedia.exceptions.PageError:
                # Skip pages that cannot be found
                continue

        return data
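
A minimal usage sketch of the class above (network access to Wikipedia is assumed; the query and result handling are illustrative):

from autoresearcher.data_sources.web_apis.wikipedia_loader import WikipediaLoader

loader = WikipediaLoader()
# Fetch up to three English-language results and print the title and URL of each
pages = loader.fetch_data("Python (programming language)", results=3)
for page in pages:
    print(page["title"], page["url"])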

fetch_data(search_query, results=10, language='en')

Fetches data from the Wikipedia API.

Args:
  search_query (str): The query to search for.
  results (int, optional): The maximum number of results to return. Defaults to 10.
  language (str, optional): The language to search in. Defaults to "en".

Returns:
  list: A list of dictionaries containing the data for each result.

Raises:
  wikipedia.exceptions.DisambiguationError: If the search query returns a disambiguation page.

Example:

  >>> loader = WikipediaLoader()
  >>> loader.fetch_data("Python")
  [
    {
      "title": "Python (programming language)",
      "url": "https://en.wikipedia.org/wiki/Python_(programming_language)",
      "summary": "Python is an interpreted, high-level, general-purpose programming language.",
      "content": "Python is an interpreted, high-level, general-purpose programming language...",
      "categories": ["Programming languages"],
      "references": ["https://www.python.org/"]
    }
  ]

Source code in autoresearcher/data_sources/web_apis/wikipedia_loader.py
def fetch_data(self, search_query, results=10, language="en"):
    """
    Fetches data from the Wikipedia API.
    Args:
      search_query (str): The query to search for.
      results (int, optional): The maximum number of results to return. Defaults to 10.
      language (str, optional): The language to search in. Defaults to "en".
    Returns:
      list: A list of dictionaries containing the data for each result.
    Raises:
      wikipedia.exceptions.DisambiguationError: If the search query returns a disambiguation page.
    Examples:
      >>> loader = WikipediaLoader()
      >>> loader.fetch_data("Python")
      [
        {
          "title": "Python (programming language)",
          "url": "https://en.wikipedia.org/wiki/Python_(programming_language)",
          "summary": "Python is an interpreted, high-level, general-purpose programming language.",
          "content": "Python is an interpreted, high-level, general-purpose programming language...",
          "categories": ["Programming languages"],
          "references": ["https://www.python.org/"]
        }
      ]
    """
    wikipedia.set_lang(language)
    wikipedia.set_rate_limiting(True)

    search_results = wikipedia.search(search_query, results=results)
    data = []

    for result in search_results:
        try:
            page = wikipedia.page(result)
            data.append(
                {
                    "title": page.title,
                    "url": page.url,
                    "summary": page.summary,
                    "content": page.content,
                    "categories": page.categories,
                    "references": page.references,
                }
            )
        except wikipedia.exceptions.DisambiguationError as e:
            # Handle disambiguation pages by selecting the first option
            if e.options:
                page = wikipedia.page(e.options[0])
                data.append(
                    {
                        "title": page.title,
                        "url": page.url,
                        "summary": page.summary,
                        "content": page.content,
                        "categories": page.categories,
                        "references": page.references,
                    }
                )
        except wikipedia.exceptions.PageError:
            # Skip pages that cannot be found
            continue

    return data
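
The language argument switches the Wikipedia edition before searching. A hedged sketch (the German query below is purely illustrative):

loader = WikipediaLoader()
# Search the German-language Wikipedia instead of the default English one
german_pages = loader.fetch_data("Maschinelles Lernen", results=2, language="de")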

SemanticScholarLoader

Bases: BaseWebAPIDataLoader

Source code in autoresearcher/data_sources/web_apis/semantic_scholar_loader.py
class SemanticScholarLoader(BaseWebAPIDataLoader):
    def __init__(self):
        """
        Initializes the SemanticScholarLoader class.
        Args:
          None
        Returns:
          None
        Notes:
          Calls the superclass constructor with the SemanticScholar API URL.
        """
        super().__init__("https://api.semanticscholar.org/graph/v1/paper/search")

    def fetch_data(self, search_query, limit=100, year_range=None):
        """
        Fetches data from the SemanticScholar API.
        Args:
          search_query (str): The query to search for.
          limit (int, optional): The maximum number of results to return. Defaults to 100.
          year_range (tuple, optional): A tuple of two integers representing the start and end year of the search. Defaults to None.
        Returns:
          list: A list of paper objects.
        Examples:
          >>> fetch_data("machine learning", limit=50, year_range=(2010, 2020))
          [{...}, {...}, ...]
        """
        params = {
            "query": search_query,
            "limit": limit,
            "fields": "title,url,abstract,authors,citationStyles,journal,citationCount,year,externalIds",
        }

        if year_range is not None:
            params["year"] = year_range

        data = self.make_request("", params=params)
        return data.get("data", [])

    def fetch_and_sort_papers(
        self,
        search_query,
        limit=100,
        top_n=20,
        year_range=None,
        keyword_combinations=None,
        weight_similarity=0.5,
    ):
        """
        Fetches and sorts papers from the SemanticScholar API.
        Args:
          search_query (str): The query to search for.
          limit (int, optional): The maximum number of results to return. Defaults to 100.
          top_n (int, optional): The maximum number of results to return after sorting. Defaults to 20.
          year_range (tuple, optional): A tuple of two integers representing the start and end year of the search. Defaults to None.
          keyword_combinations (list, optional): A list of keyword combinations to search for. Defaults to None.
          weight_similarity (float, optional): The weight to give to the similarity score when sorting. Defaults to 0.5.
        Returns:
          list: A list of the top `top_n` paper objects sorted by combined score.
        Examples:
          >>> fetch_and_sort_papers("machine learning", limit=50, top_n=10, year_range=(2010, 2020))
          [{...}, {...}, ...]
        """
        papers = []
        if keyword_combinations is None:
            keyword_combinations = [search_query]

        for combination in keyword_combinations:
            papers.extend(self.fetch_data(combination, limit, year_range))

        max_citations = max(papers, key=lambda x: x["citationCount"])["citationCount"]

        for paper in papers:
            similarity = jellyfish.jaro_similarity(search_query, paper["title"])
            normalized_citation_count = paper["citationCount"] / max_citations
            paper["combined_score"] = (weight_similarity * similarity) + (
                (1 - weight_similarity) * normalized_citation_count
            )

        sorted_papers = sorted(papers, key=lambda x: x["combined_score"], reverse=True)

        # deduplicate paper entries prior to taking top n results
        sorted_dedup_papers = list(
            {each_paper["paperId"]: each_paper for each_paper in sorted_papers}.values()
        )

        return sorted_dedup_papers[:top_n]
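
A minimal usage sketch of the class above (network access to the Semantic Scholar API is assumed; the query, year range, and keyword combinations are illustrative):

from autoresearcher.data_sources.web_apis.semantic_scholar_loader import SemanticScholarLoader

loader = SemanticScholarLoader()
top_papers = loader.fetch_and_sort_papers(
    "large language models",
    limit=50,
    top_n=10,
    year_range=(2018, 2023),
    keyword_combinations=["large language models", "transformer language models"],
)
for paper in top_papers:
    print(paper["title"], paper["citationCount"], round(paper["combined_score"], 3))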

__init__()

Initializes the SemanticScholarLoader class by calling the superclass constructor with the Semantic Scholar API URL. Takes no arguments and returns None.

Source code in autoresearcher/data_sources/web_apis/semantic_scholar_loader.py
def __init__(self):
    """
    Initializes the SemanticScholarLoader class.
    Args:
      None
    Returns:
      None
    Notes:
      Calls the superclass constructor with the SemanticScholar API URL.
    """
    super().__init__("https://api.semanticscholar.org/graph/v1/paper/search")

fetch_and_sort_papers(search_query, limit=100, top_n=20, year_range=None, keyword_combinations=None, weight_similarity=0.5)

Fetches and sorts papers from the SemanticScholar API.

Args:
  search_query (str): The query to search for.
  limit (int, optional): The maximum number of results to return. Defaults to 100.
  top_n (int, optional): The maximum number of results to return after sorting. Defaults to 20.
  year_range (tuple, optional): A tuple of two integers representing the start and end year of the search. Defaults to None.
  keyword_combinations (list, optional): A list of keyword combinations to search for. Defaults to None.
  weight_similarity (float, optional): The weight to give to the similarity score when sorting. Defaults to 0.5.

Returns:
  list: A list of the top top_n paper objects sorted by combined score.

Example:

  >>> fetch_and_sort_papers("machine learning", limit=50, top_n=10, year_range=(2010, 2020))
  [{...}, {...}, ...]

Source code in autoresearcher/data_sources/web_apis/semantic_scholar_loader.py
def fetch_and_sort_papers(
    self,
    search_query,
    limit=100,
    top_n=20,
    year_range=None,
    keyword_combinations=None,
    weight_similarity=0.5,
):
    """
    Fetches and sorts papers from the SemanticScholar API.
    Args:
      search_query (str): The query to search for.
      limit (int, optional): The maximum number of results to return. Defaults to 100.
      top_n (int, optional): The maximum number of results to return after sorting. Defaults to 20.
      year_range (tuple, optional): A tuple of two integers representing the start and end year of the search. Defaults to None.
      keyword_combinations (list, optional): A list of keyword combinations to search for. Defaults to None.
      weight_similarity (float, optional): The weight to give to the similarity score when sorting. Defaults to 0.5.
    Returns:
      list: A list of the top `top_n` paper objects sorted by combined score.
    Examples:
      >>> fetch_and_sort_papers("machine learning", limit=50, top_n=10, year_range=(2010, 2020))
      [{...}, {...}, ...]
    """
    papers = []
    if keyword_combinations is None:
        keyword_combinations = [search_query]

    for combination in keyword_combinations:
        papers.extend(self.fetch_data(combination, limit, year_range))

    max_citations = max(papers, key=lambda x: x["citationCount"])["citationCount"]

    for paper in papers:
        similarity = jellyfish.jaro_similarity(search_query, paper["title"])
        normalized_citation_count = paper["citationCount"] / max_citations
        paper["combined_score"] = (weight_similarity * similarity) + (
            (1 - weight_similarity) * normalized_citation_count
        )

    sorted_papers = sorted(papers, key=lambda x: x["combined_score"], reverse=True)

    # deduplicate paper entries prior to taking top n results
    sorted_dedup_papers = list(
        {each_paper["paperId"]: each_paper for each_paper in sorted_papers}.values()
    )

    return sorted_dedup_papers[:top_n]
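
Each paper's combined_score is weight_similarity times the Jaro similarity between the query and the paper title, plus (1 - weight_similarity) times the citation count normalized by the highest citation count in the batch. An illustrative calculation (the numbers are made up):

# Illustrative score calculation, not part of the loader
weight_similarity = 0.5
similarity = 0.8                        # jellyfish.jaro_similarity(query, title)
citation_count, max_citations = 200, 400
combined_score = weight_similarity * similarity + (1 - weight_similarity) * (citation_count / max_citations)
# combined_score == 0.65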

fetch_data(search_query, limit=100, year_range=None)

Fetches data from the SemanticScholar API.

Args:
  search_query (str): The query to search for.
  limit (int, optional): The maximum number of results to return. Defaults to 100.
  year_range (tuple, optional): A tuple of two integers representing the start and end year of the search. Defaults to None.

Returns:
  list: A list of paper objects.

Example:

  >>> fetch_data("machine learning", limit=50, year_range=(2010, 2020))
  [{...}, {...}, ...]

Source code in autoresearcher/data_sources/web_apis/semantic_scholar_loader.py
def fetch_data(self, search_query, limit=100, year_range=None):
    """
    Fetches data from the SemanticScholar API.
    Args:
      search_query (str): The query to search for.
      limit (int, optional): The maximum number of results to return. Defaults to 100.
      year_range (tuple, optional): A tuple of two integers representing the start and end year of the search. Defaults to None.
    Returns:
      list: A list of paper objects.
    Examples:
      >>> fetch_data("machine learning", limit=50, year_range=(2010, 2020))
      [{...}, {...}, ...]
    """
    params = {
        "query": search_query,
        "limit": limit,
        "fields": "title,url,abstract,authors,citationStyles,journal,citationCount,year,externalIds",
    }

    if year_range is not None:
        params["year"] = year_range

    data = self.make_request("", params=params)
    return data.get("data", [])
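
Because make_request is a thin wrapper over requests.get against the base URL set in __init__, a call such as fetch_data("machine learning", limit=50) is roughly equivalent to:

import requests

response = requests.get(
    "https://api.semanticscholar.org/graph/v1/paper/search",
    params={
        "query": "machine learning",
        "limit": 50,
        "fields": "title,url,abstract,authors,citationStyles,journal,citationCount,year,externalIds",
    },
)
papers = response.json().get("data", [])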

BaseWebAPIDataLoader

Bases: ABC

Source code in autoresearcher/data_sources/web_apis/base_web_api_data_loader.py
class BaseWebAPIDataLoader(ABC):
    def __init__(self, base_url):
        self.base_url = base_url

    @abstractmethod
    def fetch_data(self, search_query, **kwargs):
        """
        Fetches data from the API.
        Args:
          search_query (str): The search query to use.
          **kwargs: Additional keyword arguments to pass to the API.
        Returns:
          dict: The response from the API.
        Raises:
          NotImplementedError: If the method is not implemented.
        """
        pass

    def make_request(self, endpoint, params=None):
        """
        Makes a request to the API.
        Args:
          endpoint (str): The API endpoint to make the request to.
          params (dict, optional): Additional parameters to pass to the API. Defaults to None.
        Returns:
          dict: The response from the API.
        Raises:
          Exception: If the request fails.
        """
        url = f"{self.base_url}{endpoint}"
        response = requests.get(url, params=params)

        if response.status_code == 200:
            data = response.json()
            return data
        else:
            raise Exception(f"Failed to fetch data from API: {response.status_code}")
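
Subclasses supply a base URL and implement fetch_data, typically delegating the HTTP call to make_request. A hedged sketch of a hypothetical loader (the endpoint, parameter name, and response key are illustrative, not a real API):

class ExampleLoader(BaseWebAPIDataLoader):
    def __init__(self):
        # Hypothetical base URL, for illustration only
        super().__init__("https://api.example.org/v1/search")

    def fetch_data(self, search_query, **kwargs):
        # Delegate the request to make_request and unwrap the assumed payload key
        data = self.make_request("", params={"q": search_query, **kwargs})
        return data.get("results", [])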

fetch_data(search_query, **kwargs) abstractmethod

Fetches data from the API.

Args:
  search_query (str): The search query to use.
  **kwargs: Additional keyword arguments to pass to the API.

Returns:
  dict: The response from the API.

Raises:
  NotImplementedError: If the method is not implemented.

Source code in autoresearcher/data_sources/web_apis/base_web_api_data_loader.py
@abstractmethod
def fetch_data(self, search_query, **kwargs):
    """
    Fetches data from the API.
    Args:
      search_query (str): The search query to use.
      **kwargs: Additional keyword arguments to pass to the API.
    Returns:
      dict: The response from the API.
    Raises:
      NotImplementedError: If the method is not implemented.
    """
    pass

make_request(endpoint, params=None)

Makes a request to the API.

Args:
  endpoint (str): The API endpoint to make the request to.
  params (dict, optional): Additional parameters to pass to the API. Defaults to None.

Returns:
  dict: The response from the API.

Raises:
  Exception: If the request fails.

Source code in autoresearcher/data_sources/web_apis/base_web_api_data_loader.py
def make_request(self, endpoint, params=None):
    """
    Makes a request to the API.
    Args:
      endpoint (str): The API endpoint to make the request to.
      params (dict, optional): Additional parameters to pass to the API. Defaults to None.
    Returns:
      dict: The response from the API.
    Raises:
      Exception: If the request fails.
    """
    url = f"{self.base_url}{endpoint}"
    response = requests.get(url, params=params)

    if response.status_code == 200:
        data = response.json()
        return data
    else:
        raise Exception(f"Failed to fetch data from API: {response.status_code}")
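
make_request raises a plain Exception for any non-200 status code, so callers that want to tolerate failures have to catch it themselves. A minimal sketch using one of the loaders above:

loader = SemanticScholarLoader()
try:
    papers = loader.fetch_data("reinforcement learning", limit=10)
except Exception as error:  # raised for any non-200 response
    print(f"Request failed: {error}")
    papers = []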